In [1]:
import pandas as pd
import numpy as np
import torch
import pickle
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from util import preprocess_data, compute_metrics, foresee

SEED = 2137
PATH_FOR_MODEL = '../models/auto_model/days_name_desc_5_ml100'
EPOCHS = 5

data = pd.read_csv('../data/recruitment_data_modified_python.csv',
                   encoding='utf-8',
                   sep=',',
                   on_bad_lines='skip',
                   quotechar='"',
                   doublequote=True,
                   names=['URL', 'Voivodeship', 'Scrap_time', 'Name', 'Price', 'Brand', 'Condition', 'Offer_from', 'Type', 'Description', 'Added_at', 'Views', 'User_since'],
                   skiprows=1)

preprocessed_data = preprocess_data(data)

Skipping line 346: expected 13 fields, saw 14
Skipping line 367: expected 13 fields, saw 15
Skipping line 466: expected 13 fields, saw 19
Skipping line 467: expected 13 fields, saw 19
Skipping line 494: expected 13 fields, saw 15
Skipping line 615: expected 13 fields, saw 15
Skipping line 616: expected 13 fields, saw 15
Skipping line 617: expected 13 fields, saw 15
Skipping line 618: expected 13 fields, saw 15
Skipping line 619: expected 13 fields, saw 15
Skipping line 620: expected 13 fields, saw 15
Skipping line 712: expected 13 fields, saw 14
Skipping line 713: expected 13 fields, saw 14
Skipping line 739: expected 13 fields, saw 14
Skipping line 747: expected 13 fields, saw 16
Skipping line 867: expected 13 fields, saw 15
Skipping line 956: expected 13 fields, saw 14
Skipping line 1028: expected 13 fields, saw 14
Skipping line 1234: expected 13 fields, saw 15
Skipping line 1282: expected 13 fields, saw 17
Skipping line 1326: expected 13 fields, saw 14
Skipping line 1327: expected 1

In [2]:
preprocessed_data

Unnamed: 0,Price,Days_passed_name,Days_passed_name_desc
0,2799.0,52 days iphone 11 64 jak nowy 95% gwarancja wy...,52 days iphone 11 64 jak nowy 95% gwarancja wy...
1,2700.0,"51 days iphone 11 64 gb czarny, idealny z gwar...","51 days iphone 11 64 gb czarny, idealny z gwar..."
2,2899.0,51 days jak nowy apple iphone 11 256gbgb white...,51 days jak nowy apple iphone 11 256gbgb white...
3,2500.0,51 days apple iphone 11 biały 64gb - jak nowy ...,51 days apple iphone 11 biały 64gb - jak nowy ...
4,2150.0,51 days iphone 11 64 gb + gwarancja,"51 days iphone 11 64 gb + gwarancja witam, mam..."
...,...,...,...
2667,2299.0,51 days iphone 11 black 64gb,51 days iphone 11 black 64gb sprzedam iphone 1...
2668,1900.0,51 days i phone 11 64 gb cena tylko dzis,51 days i phone 11 64 gb cena tylko dzis cena ...
2669,2800.0,"51 days iphone 11 128 gb gwarancja , 100% bat...","51 days iphone 11 128 gb gwarancja , 100% bat..."
2670,1650.0,50 days iphone 11 white 64gb,50 days iphone 11 white 64gb na sprzedaż posia...


In [3]:
preprocessed_data['Days_passed_name'].tolist()

['52 days iphone 11 64 jak nowy 95% gwarancja wyświetlacz',
 '51 days iphone 11 64 gb czarny, idealny z gwarancją. wymiana',
 '51 days jak nowy apple iphone 11 256gbgb white gwarancja',
 '51 days apple iphone 11 biały 64gb - jak nowy gwarancja paragon 4xetui',
 '51 days iphone 11 64 gb + gwarancja',
 '51 days iphone 11 64 gb + gwarancja',
 '51 days iphone 11 w bardzo dobrym stanie',
 '51 days iphone 11 * idealny stan * 100% bateria * gwarancja * zamiana *',
 '51 days iphone 11 128 gb stan idealny etui gratis',
 '52 days iphone 11 64 gb prawie nowy super zestaw',
 '52 days iphone 11 czarny 128 gb',
 '51 days oryginalny | apple iphone 11 64/128gb | różne kolory | rok gwarancji',
 '51 days sprzedam iphone 11 128gb red 23msc gwarancji',
 '52 days iphone 11 64gb fioletowy perfekcyjny purple',
 '51 days apple iphone 11 64gb red poznań długa 14',
 '51 days iphone 11 ideał gwarancja 256gb',
 '51 days telefon iphone se (2020) gw do 11.09.2021',
 '51 days iphone  11 ( rezerwacja )',
 '51 days ip

In [None]:
preprocessed_data['Days_passed_name'].apply(len).mean()

In [11]:
preprocessed_data['Days_passed_name_desc'].apply(len).mean() # too many characters to process, needs truncation

577.8701347305389

In [2]:
model = AutoModelForSequenceClassification.from_pretrained('dkleczek/bert-base-polish-uncased-v1', num_labels = 1).to("cpu")

Some weights of the model checkpoint at dkleczek/bert-base-polish-uncased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio

In [3]:
tokenizer = AutoTokenizer.from_pretrained('dkleczek/bert-base-polish-uncased-v1')

scaler = StandardScaler()
y_dataset = scaler.fit_transform(np.asarray(preprocessed_data['Price']).reshape(-1,1))

X_train, X_test, y_train, y_test = train_test_split(preprocessed_data['Days_passed_name_desc'].tolist(), y_dataset, train_size=0.8)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8)

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=100)
valid_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=100)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=100)

class MakeTorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        item["labels"] = float(item["labels"])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = MakeTorchData(train_encodings, y_train.ravel())
valid_dataset = MakeTorchData(valid_encodings, y_val.ravel())
test_dataset = MakeTorchData(test_encodings, y_test.ravel())

In [4]:
# Specify the arguments for the trainer
training_args = TrainingArguments(
    output_dir = PATH_FOR_MODEL + '/checkpoints',
    num_train_epochs = EPOCHS,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 20,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    logging_dir = '../logs',
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'rmse',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
)

# Call the Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    compute_metrics = compute_metrics,
)

# Train the model
trainer.train()

# Call the summary
trainer.evaluate()

***** Running training *****
  Num examples = 1709
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 135
  Number of trainable parameters = 132122113


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 428
  Batch size = 20
Saving model checkpoint to ../models/auto_model/days_name_desc_5_ml200/checkpoints\checkpoint-27
Configuration saved in ../models/auto_model/days_name_desc_5_ml200/checkpoints\checkpoint-27\config.json
Model weights saved in ../models/auto_model/days_name_desc_5_ml200/checkpoints\checkpoint-27\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 428
  Batch size = 20
Saving model checkpoint to ../models/auto_model/days_name_desc_5_ml200/checkpoints\checkpoint-54
Configuration saved in ../models/auto_model/days_name_desc_5_ml200/checkpoints\checkpoint-54\config.json
Model weights saved in ../models/auto_model/days_name_desc_5_ml200/checkpoints\checkpoint-54\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 428
  Batch size = 20
Saving model checkpoint to ../models/auto_model/days_name_desc_5_ml200/checkpoints\checkpoint-81
Configuration saved in ../models/auto_model/days_name_desc_5_ml200/

{'eval_loss': 0.5977652072906494,
 'eval_mse': 0.5977652072906494,
 'eval_rmse': 0.773152768611908,
 'eval_mae': 0.5427095890045166,
 'eval_r2': 0.5095653745059423,
 'eval_runtime': 83.8109,
 'eval_samples_per_second': 5.107,
 'eval_steps_per_second': 0.262,
 'epoch': 5.0}

In [5]:
trainer.save_model(PATH_FOR_MODEL)
pickle.dump(scaler, open(PATH_FOR_MODEL + '/scaler.pkl','wb'))

Saving model checkpoint to ../models/auto_model/days_name_desc_5_ml200
Configuration saved in ../models/auto_model/days_name_desc_5_ml200\config.json
Model weights saved in ../models/auto_model/days_name_desc_5_ml200\pytorch_model.bin


In [6]:
predictions = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 535
  Batch size = 20


In [7]:
inversed_predictions = scaler.inverse_transform(np.asarray(predictions[0]).reshape(-1,1))
inversed_test = scaler.inverse_transform(np.asarray(predictions[1]).reshape(-1, 1))
mean_squared_error(inversed_test, inversed_predictions, squared=False)

402.1609

In [8]:
inversed_test - inversed_predictions

array([[-8.61447754e+01],
       [ 3.11942383e+02],
       [ 6.90458984e+01],
       [ 2.80180664e+01],
       [-2.69350586e+02],
       [-1.99197021e+02],
       [-1.42909180e+02],
       [ 2.58658447e+02],
       [-2.88127441e+02],
       [ 7.14162598e+01],
       [ 7.86094727e+02],
       [ 1.28016357e+02],
       [ 2.25211426e+02],
       [ 4.71099121e+02],
       [-4.44996582e+02],
       [ 6.67950928e+02],
       [ 7.23645020e+01],
       [-2.73258545e+02],
       [ 1.72204028e+03],
       [-1.15820068e+02],
       [ 2.50641846e+02],
       [ 3.67878174e+02],
       [ 4.73636963e+02],
       [ 6.65746582e+02],
       [ 5.94321289e+01],
       [ 5.41000244e+02],
       [-7.69043701e+02],
       [-2.73112305e+02],
       [ 7.87102051e+01],
       [ 8.71835938e+01],
       [ 2.69414062e+01],
       [ 2.01917480e+02],
       [ 7.12998047e+01],
       [-2.39421387e+01],
       [-5.67323975e+02],
       [ 1.44366455e+02],
       [ 2.37097900e+02],
       [ 1.09754395e+02],
       [ 3.5

In [9]:
extrapolated_x = []
extrapolated_y = []
for i in range(51, 85):
    extrapolated_x.append(str(i) + ' days' + ' iphone 11')
    extrapolated_y.append(0)

In [10]:
extrapolated_x

['51 days iphone 11',
 '52 days iphone 11',
 '53 days iphone 11',
 '54 days iphone 11',
 '55 days iphone 11',
 '56 days iphone 11',
 '57 days iphone 11',
 '58 days iphone 11',
 '59 days iphone 11',
 '60 days iphone 11',
 '61 days iphone 11',
 '62 days iphone 11',
 '63 days iphone 11',
 '64 days iphone 11',
 '65 days iphone 11',
 '66 days iphone 11',
 '67 days iphone 11',
 '68 days iphone 11',
 '69 days iphone 11',
 '70 days iphone 11',
 '71 days iphone 11',
 '72 days iphone 11',
 '73 days iphone 11',
 '74 days iphone 11',
 '75 days iphone 11',
 '76 days iphone 11',
 '77 days iphone 11',
 '78 days iphone 11',
 '79 days iphone 11',
 '80 days iphone 11',
 '81 days iphone 11',
 '82 days iphone 11',
 '83 days iphone 11',
 '84 days iphone 11']

In [11]:
extrapolated_tokens = tokenizer(extrapolated_x, truncation=True, padding=True, max_length=50)

extrapolated_dataset = MakeTorchData(extrapolated_tokens, np.asarray(extrapolated_y).ravel())

In [12]:
extrapolated_predictions = trainer.predict(extrapolated_dataset)

***** Running Prediction *****
  Num examples = 34
  Batch size = 20


In [13]:
inversed_extrapolated = scaler.inverse_transform(np.asarray(extrapolated_predictions[0]).reshape(-1, 1))
type(inversed_extrapolated)

numpy.ndarray

In [17]:
# def foresee(model: Trainer,
#             scaler: StandardScaler,
#             tokenizer: AutoTokenizer,
#             text: str = 'iphone 11',
#             days: tuple = (1, 90)) -> np.ndarray:
#
#     extrapolated_x = []
#     extrapolated_y = []
#     for i in range(*days):
#         extrapolated_x.append(str(i) + ' days ' + text)
#         extrapolated_y.append(0)
#
#     extrapolated_tokens = tokenizer(extrapolated_x, truncation=True, padding=True, max_length=50)
#     extrapolated_dataset = MakeTorchData(extrapolated_tokens, np.asarray(extrapolated_y).ravel())
#     extrapolated_predictions = model.predict(extrapolated_dataset)
#     inversed_extrapolated = scaler.inverse_transform(np.asarray(extrapolated_predictions[0]).reshape(-1, 1))
#
#     return inversed_extrapolated

In [14]:
foresee(trainer, scaler, tokenizer, 'iphone 11 64')

***** Running Prediction *****
  Num examples = 89
  Batch size = 20


array([[2421.4336],
       [2471.8577],
       [2424.6587],
       [2443.429 ],
       [2434.9146],
       [2411.59  ],
       [2434.2788],
       [2433.5447],
       [2392.3652],
       [2382.0662],
       [2382.2007],
       [2361.1692],
       [2383.7761],
       [2398.1406],
       [2387.7197],
       [2381.7756],
       [2401.7253],
       [2385.581 ],
       [2402.6443],
       [2380.987 ],
       [2385.4197],
       [2404.4084],
       [2407.9583],
       [2388.5217],
       [2410.954 ],
       [2385.3472],
       [2390.5469],
       [2397.107 ],
       [2379.9167],
       [2374.3323],
       [2392.5962],
       [2380.804 ],
       [2383.851 ],
       [2382.87  ],
       [2376.0005],
       [2365.8708],
       [2379.5488],
       [2368.78  ],
       [2356.585 ],
       [2376.3508],
       [2375.6414],
       [2405.5293],
       [2387.1875],
       [2376.4343],
       [2378.241 ],
       [2381.0713],
       [2371.701 ],
       [2356.8418],
       [2407.3577],
       [2384.8042],


In [19]:
# pickle.dump(scaler, open(PATH_FOR_MODEL + '/scaler.pkl','wb'))
# scaler = pickle.load(open('../results/scaler_auto_model.pkl','rb'))