In [21]:
import pandas as pd

In [22]:
# Human-readable names for the integer labels; a label value is used directly
# as an index into this list.
class_list = [
    "NOT_TRIP",  # label 0
    "TRIP",      # label 1
]

In [23]:
# Load the semicolon-separated training dataset.
df = pd.read_csv(
    "datasets/bert_ds.csv",
    sep=";",
)

In [24]:
# Remove the columns that are not used for classification: the three named
# columns first, then whichever column is left in first position
# (presumably a saved index column -- confirm against the CSV).
df = df.drop(columns=['origin', 'destination', 'detours'])
df = df.drop(columns=df.columns[0])

In [25]:
# Build the integer label column: 1 where `is_trip` equals 1, otherwise 0.
# A vectorised comparison is used instead of a row-wise `apply` with a Python
# lambda; it yields the same values without a per-row function call.
df['pred_class'] = (df['is_trip'] == 1).astype('int64')

# The source flag is no longer needed once the label column exists.
del df['is_trip']

In [None]:
# Preview the first rows of the prepared frame.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
 
# Hold out 10% of the rows for evaluation. A fixed random_state makes the
# split identical on every run, so metrics stay comparable between runs.
train_df, test_df = train_test_split(df, test_size=0.10, random_state=42)
 
print('train shape: ',train_df.shape)
print('test shape: ',test_df.shape)

In [8]:
from simpletransformers.classification import ClassificationModel

In [29]:
# Training options passed to the model through its `args` parameter.
train_args = {
    "reprocess_input_data": True,
    "fp16": False,
    "num_train_epochs": 4,
    # Fixed seed so that repeated training runs are reproducible.
    "manual_seed": 42,
}

In [None]:
# Binary classifier built on a French BERT checkpoint; CUDA is disabled.
model = ClassificationModel(
    model_type="bert",
    model_name="dbmdz/bert-base-french-europeana-cased",
    num_labels=2,
    args=train_args,
    use_cuda=False,
)

In [None]:
# Fine-tune the model on the training split.
# NOTE(review): the frame carries the label in `pred_class`; confirm that
# simpletransformers picks up the text/label columns as intended.
model.train_model(train_df)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
 
def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of `preds` against `labels`.

    NOTE(review): for single-label predictions micro-averaged F1 usually
    coincides with plain accuracy -- confirm this is the intended averaging.
    """
    score = f1_score(labels, preds, average='micro')
    return score
 
# Evaluate on the held-out split. The `f1` and `acc` keyword arguments pass
# metric callables (presumably reported under those names in `result` --
# confirm against the simpletransformers docs).
result, model_outputs, wrong_predictions = model.eval_model(test_df, f1=f1_multiclass, acc=accuracy_score)

In [None]:
# Display the evaluation result returned by eval_model.
result

In [None]:
# Smoke-test the trained model on one hand-written sentence and print the
# name of the predicted class.
test = "j'aimerais aller a strasbourg depuis paris"
predictions, raw_outputs = model.predict([test])
print(class_list[predictions[0]])

In [9]:
import os
import tarfile

In [10]:
def pack_model(model_path='', file_name=''):
    """Bundle the top-level files of `model_path` into `<file_name>.tar.gz`.

    Only entries located directly inside `model_path` that are not
    directories are archived; sub-directories are skipped. Each file is added
    to the archive under the name `<model_path>/<file>`.

    Args:
        model_path: Directory whose files should be archived.
        file_name: Output path without the `.tar.gz` suffix.

    Raises:
        OSError: If `model_path` cannot be listed (for example
            FileNotFoundError when it does not exist). The error is raised
            before any archive file is created.
    """
    # List just the directory itself rather than walking the whole tree, and
    # sort the names so the archive members are in a deterministic order.
    with os.scandir(model_path) as entries:
        files = sorted(entry.name for entry in entries if not entry.is_dir())

    with tarfile.open(file_name + '.tar.gz', 'w:gz') as archive:
        for file in files:
            archive.add(f'{model_path}/{file}')

In [11]:
# Archive the files in the `outputs` directory as bert-fr-trip.tar.gz.
pack_model('outputs', 'bert-fr-trip')

In [81]:
# Load the semicolon-separated validation dataset.
ds_val = pd.read_csv("datasets/bert_ds_val.csv", delimiter=';')

# Remove the columns that are not used for classification: the three named
# columns first, then whichever column is left in first position
# (presumably a saved index column -- confirm against the CSV).
ds_val = ds_val.drop(columns=['origin', 'destination', 'detours'])
ds_val = ds_val.drop(columns=ds_val.columns[0])

# Build the integer label column: 1 where `is_trip` equals 1, otherwise 0.
# A vectorised comparison is used instead of a row-wise `apply` with a Python
# lambda; it yields the same values without a per-row function call.
ds_val['pred_class'] = (ds_val['is_trip'] == 1).astype('int64')
ds_val = ds_val.drop(columns='is_trip')

In [83]:
# Shuffle the validation rows. The fixed random_state keeps the row order
# (and therefore the printed per-row report) identical between runs.
ds_val = ds_val.sample(frac=1, random_state=42)

In [None]:
# Display the shuffled validation frame.
ds_val

In [90]:
# Counter of correct validation predictions. Note that this rebinds `result`,
# which previously held the value returned by model.eval_model.
result = 0

In [None]:
# Reset the counter inside this cell so that re-running it does not add to
# the total left over from a previous run.
result = 0

# Predict the whole validation set with one batched call instead of invoking
# the model once per row; an empty set skips the model call entirely.
texts = ds_val['text'].tolist()
predictions, _ = model.predict(texts) if texts else ([], None)

# Report each prediction next to its true label and count the matches.
for text, real_val, predicted in zip(texts, ds_val['pred_class'], predictions):
    print(f'text: {text}')
    print(f'\tPREDICTED: {class_list[predicted]}')
    print(f'\tREAL: {class_list[real_val]}')
    if predicted == real_val:
        result += 1

In [None]:
# Summarise the validation run. An empty validation set is reported
# explicitly instead of failing with a division by zero.
n_val_rows = len(ds_val)
print(f'Number of rows in val: {n_val_rows}')
if n_val_rows:
    print(f'Accuracy: {(result * 100) / n_val_rows:.2f}%')
else:
    print('Accuracy: n/a (validation set is empty)')