In [1]:
# Necessary imports and libraries
import csv
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils import check_random_state

In [2]:
# Definition of auxiliary methods and variables
# Shared state for the price-completion helpers below. Both dictionaries are
# keyed by str(row['id']) and are not cleared anywhere in the visible code, so
# their entries persist across successive input_parser() calls.
# actual_price: most recent non-missing price seen so far for each id
actual_price = {}
# first_price: first non-missing price seen for each id
first_price = {}
# Complete the prices with the sample with the same "id"
def __price_completer(row):
    """Return the price of ``row``, filling a missing one from earlier rows.

    Meant to be applied row by row, in file order: every non-missing price is
    remembered per id in the module-level ``actual_price`` (latest value) and
    ``first_price`` (first value) dictionaries.

    Parameters
    ----------
    row : mapping
        Must provide ``'precio'`` (float, NaN, or a string that may use a
        decimal comma such as ``'3,5'``) and ``'id'``.

    Returns
    -------
    float
        The parsed price. When the price is NaN, the latest price already
        seen for the same id, or the placeholder ``-1.0`` if none was seen.
    """
    raw_price = row['precio']
    # Textual prices use a decimal comma; normalise before converting
    precio = float(raw_price.replace(',', '.')) if isinstance(raw_price, str) else raw_price
    identificador = str(row['id'])

    if math.isnan(precio):
        # -1.0 marks "still unknown"; __price_completer_proximity resolves it later
        return actual_price.get(identificador, -1.0)

    actual_price[identificador] = precio
    # Only the first valid price per id is kept here
    first_price.setdefault(identificador, precio)
    return precio

# Complete the price by proximity
def __price_completer_proximity(row):
    """Resolve the ``-1.0`` placeholder price using ``first_price``.

    Rows whose ``'precio'`` is anything other than ``-1.0`` are returned
    untouched. Otherwise the first price recorded for ``str(row['id'])`` is
    returned, or ``None`` when no price was recorded for that id.
    """
    current = row['precio']
    if current != -1.0:
        return current
    return first_price.get(str(row['id']))

# Calculate favorable cases
def __favorable_cases(test, pred):
    """Return the fraction of samples whose prediction is not below the target.

    ``test`` and ``pred`` are indexable sequences; positions ``0..len(test)-1``
    are compared. A sample counts against the ratio when ``pred[i] < test[i]``.
    """
    n_samples = len(test)
    shortfalls = sum(1 for idx in range(n_samples) if pred[idx] < test[idx])
    return (n_samples - shortfalls) / n_samples

# Calculate datathon metric
def __datathon_metric(y_test, y_train, pred):
    """Combine relative RMSE and the favorable-case ratio into one score.

    The RMSE of ``pred`` against ``y_test.values`` is divided by
    ``y_train.mean()``; the result is weighted 0.7 and added to 0.3 times the
    share of non-favorable cases (``1 - __favorable_cases``).
    """
    observed = y_test.values
    rmse = math.sqrt(mean_squared_error(observed, pred))
    relative_rmse = rmse / y_train.mean()
    favorable_ratio = __favorable_cases(observed, pred)
    return (0.7 * relative_rmse) + (0.3 * (1 - favorable_ratio))

# Convert YYYY-MM-DD to DD-MM-YYYY
def __datetime_parser(datetime):
    """Reorder a ``YYYY-MM-DD`` string into ``DD-MM-YYYY``.

    The input is split on ``'-'`` and its first three pieces are emitted in
    reverse order; any further pieces are ignored.
    """
    pieces = datetime.split('-')
    return '-'.join((pieces[2], pieces[1], pieces[0]))

In [3]:
# Method to preprocess the input files
def input_parser(path, option):
    """Preprocess a pipe-separated input file and write the result as CSV.

    The file at ``path`` is read, exact duplicate rows are dropped and, unless
    ``option`` is ``'base'``, the feature-engineering steps below are applied.
    The result is written to ``<path without '.txt'>_<option>.csv`` with ``|``
    as separator. Nothing is returned.

    Parameters
    ----------
    path : str
        Location of the input file.
    option : str
        ``'base'`` only drops duplicates; ``'drop'`` runs the full
        preprocessing and removes 'categoria_dos'; any other value runs the
        full preprocessing and replaces missing 'categoria_dos' values by 0.

    Notes
    -----
    Price completion goes through the module-level ``actual_price`` and
    ``first_price`` dictionaries, which are not reset here, so prices learnt
    from one file are still available when a later file is parsed.
    NOTE(review): 'fecha' is parsed with pandas' default format inference;
    confirm the day/month order against the raw files.
    """
    df = pd.read_csv(filepath_or_buffer=path, sep='|')
    # There are duplicated samples
    df.drop_duplicates(inplace=True)
    if option != 'base':
        # Drop the useless hour of 'fecha' column
        df['fecha'] = df['fecha'].str.replace(' 0:00:00', '', regex=False)
        # Completion of 'precio' column: first from earlier rows of the same
        # id, then from the first price ever seen for that id
        df['precio'] = df.apply(__price_completer, axis=1)
        df['precio'] = df.apply(__price_completer_proximity, axis=1)
        # Split of 'fecha' column (parsed once, then replaced by its parts)
        fecha = pd.DatetimeIndex(pd.to_datetime(df['fecha']))
        df['dia'] = fecha.day
        df['mes'] = fecha.month
        df['anyo'] = fecha.year
        df.drop('fecha', axis=1, inplace=True)

        # One-hot encoding: the indicator columns are appended in this order
        # and each source column is then removed
        for column in ('estado', 'categoria_uno', 'dia_atipico'):
            dummies = pd.get_dummies(df[column], prefix=column)
            df = pd.concat([df, dummies], axis=1).drop([column], axis=1)

        # Assign the result back: an in-place fillna on a selected column is a
        # chained operation that may not update the frame itself
        df['antiguedad'] = df['antiguedad'].fillna(0)

        if option == 'drop':
            # 'categoria_dos' drop
            df.drop('categoria_dos', axis=1, inplace=True)
        else:
            # 'categoria_dos' corrupted samples correction
            df['categoria_dos'] = df['categoria_dos'].fillna(0)

    df.to_csv(index=False, path_or_buf=path.replace(
        '.txt', '') + "_" + option + ".csv", sep='|')

In [4]:
# "Modelar" and "Estimar" dataframes
# Every path uses forward slashes: they work on Windows and POSIX alike, match
# the paths used to read the generated files back, and avoid the invalid "\M"
# escape sequence of a non-raw "data\Modelar..." literal.
input_parser("data/Modelar_UH2021.txt", 'drop')
modelar = pd.read_csv('data/Modelar_UH2021_drop.csv', sep='|', low_memory=False)
# The "Estimar" dataframe has no samples with "estado" = "Rotura", so its
# one-hot encoding lacks that column; drop it here to keep both frames aligned
modelar = modelar.drop('estado_Rotura', axis=1)
input_parser("data/Estimar_UH2021.txt", 'drop')
estimar = pd.read_csv('data/Estimar_UH2021_drop.csv', sep='|', low_memory=False)
# The 'base' variant keeps the original 'fecha' column for the response file
input_parser("data/Estimar_UH2021.txt", 'base')
estimar_data = pd.read_csv('data/Estimar_UH2021_base.csv', sep='|', low_memory=False)

  if (await self.run_code(code, result,  async_=asy)):


In [5]:
# Train and evaluate the model
# Random state seed, to recreate our results
rng = 42
# Hold out 20% of the samples; every column except the target is a feature
X_train, X_test, y_train, y_test = train_test_split(
    modelar.drop(columns='unidades_vendidas'),
    modelar['unidades_vendidas'],
    test_size=0.2,
    random_state=rng,
)

# Fit the random forest on the training partition
model = RandomForestRegressor(
    n_estimators=150,
    min_samples_split=3,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=rng,
)
model.fit(X_train, y_train)

# Score the held-out partition; predictions are rounded to integers first
pred = [round(value) for value in model.predict(X_test)]
cf = __favorable_cases(y_test.values, pred)
metric = __datathon_metric(y_test, y_train, pred)

print('MSE: {}'.format(mean_squared_error(y_test, pred)))
print('MAE: {}'.format(mean_absolute_error(y_test, pred)))
print('Favorable cases: {}'.format(cf))
print('Datathon metric: {}'.format(metric))

MSE: 106.45139065900669
MAE: 2.680797925530872
Favorable cases: 0.812618870218231
Datathon metric: 2.006255856266547


In [8]:
# Finally, the generation of response file
# Predict before opening the output so a failing prediction cannot leave a
# truncated response file behind
estimar_prediction = model.predict(estimar)
# 'estimar_data' supplies the dates and 'estimar' the ids; rows are paired by
# position, so all three sequences must have the same length
assert len(estimar_data) == len(estimar) == len(estimar_prediction), \
    'estimar_data, estimar and the predictions must have the same number of rows'
with open('res/Nevermore.txt', 'w') as csv_file:
    csv_file.write('FECHA|ID|UNIDADES\n')
    # Iterate the columns directly instead of looking rows up one by one
    for fecha, identificador, est_pred in zip(
            estimar_data['fecha'], estimar['id'], estimar_prediction):
        csv_file.write("{}|{}|{}\n".format(fecha, int(identificador), round(est_pred)))