In [12]:
from pathlib import Path

import numpy as np
import pandas as pd

In [13]:
from typing import Final

# Fixed integer seed; presumably meant to be passed wherever a random_state / RNG
# seed is required so runs are repeatable — TODO confirm every stochastic step uses it.
# Annotated as Final so static type checkers flag any later reassignment.
SEED: Final[int] = 42

In [14]:
def preprocess(data: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw product table into the numeric feature table used for modelling.

    Steps performed on a deep copy of ``data`` (the input is never modified):

    1. ``Rating`` is converted to ``float``. Text values written with a decimal
       comma (``"4,5"``) have the comma replaced by a dot first; a column that
       pandas already parsed as numeric is simply cast to ``float``.
    2. ``Price diff`` is added as ``Max price - Min price``.
    3. The columns listed in ``dropped`` below are removed.

    Args:
        data: Frame containing at least ``Rating``, ``Max price``, ``Min price``
            and every column named in ``dropped``.

    Returns:
        A new DataFrame with the remaining columns plus ``Price diff``.

    Raises:
        KeyError: If any of the required columns is missing.
        ValueError: If a ``Rating`` value cannot be parsed as a float.
    """
    frame = data.copy(deep=True)

    rating = frame['Rating']
    if pd.api.types.is_numeric_dtype(rating):
        # A file without decimal commas is parsed as numeric by read_csv, and
        # the .str accessor would raise AttributeError on such a column.
        frame['Rating'] = rating.astype(float)
    else:
        # regex=False: the comma is a literal character, not a pattern.
        frame['Rating'] = rating.str.replace(',', '.', regex=False).astype(float)

    frame['Price diff'] = frame["Max price"] - frame["Min price"]

    dropped = [
        "full_category",
        "Color",
        "Max price",
        "Min price",
        "Average price",
        "Brand",
        "Seller",
        "Name",
        "Category",
    ]
    return frame.drop(columns=dropped)

In [15]:
# Directory holding the competition CSV files, relative to the notebook's working directory.
data_dir = Path("../data")
# Raw training table exactly as stored on disk; cleaning happens in a later step.
train_csv = pd.read_csv(filepath_or_buffer=data_dir.joinpath("train.csv"))

In [16]:
# Build the training features, then discard every row that still contains a missing value.
train_data = (
    preprocess(data=train_csv)
    .dropna(axis='index')
)

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Load the test set. The Id column is kept aside for the submission file and
# removed from the frame so it is not used as a model feature.
test_csv_file = pd.read_csv(data_dir / "test.csv")
id_column = test_csv_file.Id.tolist()
test_csv_file = test_csv_file.drop(columns=['Id'])  # rebind instead of inplace=True
test_unit = preprocess(test_csv_file)

# Features are every training column except the target; drop() already returns
# a new frame, so no explicit deep copy is required.
train_unit = train_data.drop(columns=['Sales'])

# Seed the forest from the notebook-level SEED constant instead of a repeated literal.
estimator = RandomForestRegressor(n_estimators=100, n_jobs=-1, verbose=True, random_state=SEED)
estimator.fit(X=train_unit, y=train_data.Sales)
results = estimator.predict(X=test_unit)

# Predictions are rounded to whole numbers for the submission table.
submit_csv_file = pd.DataFrame({'Id': id_column, 'Expected': np.round(results)})
submit_csv_file

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


Unnamed: 0,Id,Expected
0,0,41.0
1,1,24.0
2,2,47.0
3,3,0.0
4,4,6.0
...,...,...
10736,10736,3.0
10737,10737,29.0
10738,10738,148.0
10739,10739,2.0


In [18]:
# Persist the baseline submission. The target directory is created first so the
# cell also works on a fresh checkout where ../submissions does not exist yet.
submission_path = Path('../submissions/baseline.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submit_csv_file.to_csv(submission_path, index=False)