In [97]:
#! pip install imblearn
#! pip install xgboost

#from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
from sklearn.metrics import confusion_matrix
import zipfile
from zipfile import ZipFile
from pathlib import Path
from typing import Dict, Optional, List, Sequence, Tuple, Any
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from docx import Document
import matplotlib.pyplot as plt
import dill as pkl  # dill is used because pickle cannot handle lambda functions
import pickle
from datetime import date
from pathlib import Path
import zipfile
from zipfile import ZipFile
import dill as pkl
import os
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

TODAY = date.today().strftime("%Y%m%d")
CLIENT = "Uniper"
MIN_NUM_OF_SAMPLES = 5
COUNTRY = 'ALL'
SAVE_CLFS = True
RES_DIR = Path("./retraining_october21/")
SCAN_ID_COL = "filename"  # document identifier col used when reducing global df to relevant examples for attribute

In [98]:
if not os.path.exists(RES_DIR):
    os.makedirs(RES_DIR)

In [99]:
def reduce_to_relevant(df: DataFrame, col: str, min_num_samples: int) -> DataFrame:
    # find rows for values that appear at least min_num_samples times
    relevant = [x for x in df[col].value_counts().index
                if df[col].value_counts()[x] >= min_num_samples]
    # create boolean mask
    mask = [(x in relevant) for x in df[col]]
    print(
        f"Reduced to {len(df[mask])} samples from {len(relevant)} relevant classes. (N={min_num_samples})"
    )
    return df[mask]


def get_reduced_df(
        df: DataFrame,
        feature_col: str,
        scan_id_col: str,
        min_num_samples: int) -> DataFrame:
    # keep only documents with unambiguous value for this col
    df_ = df.drop_duplicates(subset=[scan_id_col, feature_col])\
            .groupby(scan_id_col)\
            .filter(lambda x: len(x) == 1)

    return reduce_to_relevant(df_, feature_col, min_num_samples)


def split_for_target_col(df, col, test_size=0.2, random_state=42):
    return train_test_split(
        df,
        df[col],
        test_size=test_size,
        random_state=random_state
    )


def split_for_target_col_stratified(df, col, test_size=0.2, random_state=42,):
    return train_test_split(df,
        df[col],
        test_size=test_size,
        random_state=random_state,
        stratify=df[col]
    )


def _get_top_n_results_with_confs(
        clazzes: Sequence[str],
        probs: List[float],
        n: int = 1) -> List[Tuple[str, float]]:
    return sorted(
        zip(clazzes, probs),
        key=lambda x: x[1],
        reverse=True
    )[:n]


def get_results_for_target(
        target_clf: Any,
        df: DataFrame) -> List[Tuple[str, float]]:
    probs = target_clf.predict_proba(df)
    clazzes = target_clf.classes_
    results = []
    for prob_list in probs:
        results.append(_get_top_n_results_with_confs(clazzes, prob_list)[0])
    return results

In [4]:
%load_ext autoreload
%autoreload 2

In [100]:
tday = date.today().strftime("%A %d. %B %Y") 
print(f"Models trained on {tday} with package versions: \n")
print(f"scikit-learn: {sklearn.__version__}")
print(f"dill: {pkl.__version__}")

Models trained on Tuesday 25. January 2022 with package versions: 

scikit-learn: 0.24.2
dill: 0.3.4


In [101]:
# dict used to enrich ground truth later
country_to_country_group = {
    "DE": 'de',
    "SE": 'se',
    "AT": 'at',
    "GB": 'uk',
    "BE": 'ubx',
    "NL": 'ubx',
    "LU": 'ubx'   
}

In [102]:
df_ground_truth = pd.read_excel("C:/Users/runyaoyu/Desktop/Project1101/Sebastian/RetrainingOctober2021/Daten/Uniper_GT_09_21.xlsx") #Uniper_GT_09_21.xlsx
df_ground_truth.dropna(subset=['gl_document_scan_id'], inplace=True)
df_ground_truth["gl_document_scan_id"] = df_ground_truth["gl_document_scan_id"].apply(lambda x: x.lower())


In [103]:
with open("C:/Users/runyaoyu/Desktop/Project1101/Sebastian/RetrainingOctober2021/Daten/texts_all_rt202110.pkl", "rb") as file:
    df_lume = pickle.load(file)

df_merged = df_lume.merge(df_ground_truth, left_on=["filename"], right_on=["gl_document_scan_id"], how="inner")
df_merged["country"] = df_merged['le_country_id'].apply(lambda x: country_to_country_group[x])
df_merged["country"].value_counts()


de     47313
se     36580
ubx     4684
uk      3829
at      2429
Name: country, dtype: int64

In [104]:
df_lume = df_merged
print(df_lume.shape, "\n")

for country in ["de", "at", "uk", "se", "ubx"]:
    with open(f"C:/Users/runyaoyu/Desktop/Project1101/Sebastian/RetrainingOctober2021/Daten/ocr_text_df_{country}.pkl", 'rb') as file:  
        df_lume_country = pickle.load(file)
    df_lume_country["country"] = country
    print(f"shape {country}: {df_lume_country.shape}\n")
    df_lume = pd.concat([df_lume, df_lume_country], ignore_index=True)

(94835, 57) 

shape de: (23618, 3)

shape at: (478, 3)

shape uk: (7865, 3)

shape se: (22014, 3)

shape ubx: (4629, 3)



In [105]:
df_lume = df_lume[["filename", "text", "country", "gl_posting_id"]]
df_lume.head()

# some cleaning
df_lume["filename"] = df_lume["filename"].apply(lambda filename: filename.lower())
df_lume = df_lume[df_lume["text"] != 0]
df_lume = df_lume[df_lume["text"] != '']
df_lume["text"] = df_lume["text"].apply(lambda text: str(text).replace("\r\n", " ").replace("\n", " ").lower())
df_lume.dropna(subset=['text'], inplace=True)
df_lume.drop_duplicates(inplace=True)

In [106]:
def get_certain_class_after_vec_country(df_lume, label):
    label = label
    df_attr = get_reduced_df(df_lume, label, SCAN_ID_COL, MIN_NUM_OF_SAMPLES)
    x_train, x_test, y_train, y_test = split_for_target_col_stratified(df_attr, label)
    vectorizer = TfidfVectorizer(max_features=20000, max_df=0.75, sublinear_tf=True,)
    X_train = x_train['text']
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test = x_test['text']
    X_test_vec = vectorizer.transform(X_test)
    return df_attr, X_train, X_test, y_train, y_test, X_train_vec, X_test_vec

In [107]:
# country
label = 'country'
df_attr, X_train, X_test, y_train, y_test, X_train_vec, X_test_vec = get_certain_class_after_vec_country(df_lume, label)

Reduced to 89503 samples from 5 relevant classes. (N=5)


In [108]:
df_attr.sample(10)

Unnamed: 0,filename,text,country,gl_posting_id
140713,ecb1d7761cc01ee7bc8efa9318d5b84a,1800305sebfd01m03788 u. faktura sida 1 av 1 5...,se,
24371,000d3a2c37f81eea88cdd6d13234d72d,$726 1_314577_2 e10 b27v56 l wo <bas zebi_n...,de,40.0
11947,000d3a2c37f81eeabb81cd063e30e60d,faktura - —— db > ı 8 169 04 solna nr 9652851...,se,40.0
146777,fc15b423ffe81ee8998fa426e4e7b315,e10 c.. | 'ei leveransadress uniper gastur...,se,
120251,9cb65498ce781ee898f70478d513de3c,"alstom power ltd. lichfield road stafford, st1...",uk,
44316,b05adabc4b681ee9b0c6850b9907627e,il | 75300003056240 dds conferencing & cater...,de,40.0
68913,ecb1d7761cc01ed99fa12e46bc71a4cf,1914204sebfdo1m43810 39 = » en io ». db- ii t...,se,40.0
33222,9cb65498ce781ed988f1b72530973b19,heavy haul power international hh pi gmbh. st...,de,50.0
136693,b05adabc4b681ee8b38e331ef60a64b5,1828305sebfdo1m20260 i mannheimer swartling ...,se,
29040,000d3a2c37f81eeab5a967d104da39d0,sixt:pn electronically signed on 03.08.2020 00...,de,40.0


In [109]:
print(X_train.shape, X_test.shape)
print(X_train_vec.shape, X_test_vec.shape)
#print(X_train_vec)

(71602,) (17901,)
(71602, 20000) (17901, 20000)


In [207]:
from keras_preprocessing.text import Tokenizer

tokenizers_new = Tokenizer(num_words=20000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizers_new.fit_on_texts(df_attr['text'].values)
word_index = tokenizers_new.word_index
token_id = tokenizers_new.texts_to_sequences(df_attr['text'].values)

In [184]:
token_id[0]

303

In [209]:
import numpy as np
import random
import json
import copy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import glob
from sklearn.preprocessing import LabelEncoder

labels_temp = df_attr[label]
le = LabelEncoder()
encoded_labels = le.fit_transform(labels_temp)

meta = []

for i in range(len(token_id)):
    df_temp = {'id': encoded_labels[i], 'token': token_id[i]}
    meta.append(df_temp)


In [210]:
train_data, valid_data = train_test_split(meta, train_size = 0.8, random_state = 1)

In [211]:
class Text_Dataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        sample['token'] = torch.Tensor(sample['token'])
        return sample

In [212]:
train_dataset = Text_Dataset(train_data)
valid_dataset = Text_Dataset(valid_data)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=True)

In [213]:
sample = next(iter(train_dataloader))

print('Sample from train dataloader: ')
print('USER ID: ', sample['id'])
print('TOKEN ID: ', sample['token'])
print('TOKEN ID shape should be BATCH by LENGTH: ', sample['token'].shape)

Sample from train dataloader: 
USER ID:  tensor([1], dtype=torch.int32)
TOKEN ID:  tensor([[4.6700e+02, 6.6000e+02, 5.5880e+03, 1.1260e+03, 2.5000e+01, 1.9900e+02,
         1.3700e+02, 1.9570e+03, 1.7000e+01, 1.9900e+02, 1.3700e+02, 2.9390e+03,
         3.7500e+02, 1.7400e+02, 1.1050e+03, 7.3360e+03, 2.6000e+01, 1.4900e+02,
         1.7000e+01, 7.6800e+02, 1.0112e+04, 7.0710e+03, 2.5640e+03, 1.9070e+03,
         6.9990e+03, 6.7100e+02, 5.0400e+02, 4.0000e+01, 3.2310e+03, 6.7000e+01,
         1.1641e+04, 1.2600e+02, 7.8000e+01, 2.5600e+02, 5.9300e+03, 7.8000e+01,
         2.5600e+02, 2.4640e+03, 6.7000e+01, 6.8200e+02, 5.8550e+03, 9.0000e+00,
         5.0390e+03, 5.3000e+02, 2.2280e+03, 7.6710e+03, 1.6250e+03, 7.1490e+03,
         8.0500e+02, 6.6760e+03, 3.8080e+03, 8.1810e+03, 2.0530e+03, 2.9670e+03,
         2.3000e+01, 5.5000e+01, 7.0000e+00, 4.5700e+03, 7.5900e+02, 1.5700e+02,
         5.8230e+03, 7.5900e+02, 1.5234e+04, 1.5700e+02, 1.1059e+04, 3.8740e+03,
         1.5080e+03, 1.635

In [214]:
class Model(nn.Module):
    def __init__(self, num_token, num_user, embed_dim, rnn_dim, num_layers):
        super(Model, self).__init__()
        self.num_token = num_token
        self.num_user = num_user
        self.embed_dim = embed_dim
        self.rnn_dim = rnn_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(num_token, embed_dim)
        self.rnn = nn.GRU(embed_dim, rnn_dim, num_layers=num_layers, batch_first=True)
        self.out_linear = nn.Linear(rnn_dim, num_user)
        
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, token_id):
        embed = self.embedding(token_id)
        embed = self.dropout(embed)
        out, _ = self.rnn(embed)
        out = self.dropout(out)
        return self.out_linear(out[:, -1])

In [215]:
#device = 'cuda'
device = 'cpu'

model = Model(num_token=len(word_index), num_user=5, embed_dim=512, rnn_dim=1024, num_layers=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-7)

In [216]:
num_param = sum(p.numel() for p in model.parameters())
print('Number of parameters: {}'.format(num_param))
pred = model(sample['token'].long().to(device))
print('Prediction shape would be BATCH X NUM_USER(OUTPUT) : ', pred.shape)

Number of parameters: 707470341
Prediction shape would be BATCH X NUM_USER(OUTPUT) :  torch.Size([1, 5])


In [None]:
criteria = nn.CrossEntropyLoss()
avg_loss = 0.0
best_valid_accu = 0.0
best_epoch = -1
best_model = None
num_epoch = 30
x,y = [],[]

for epoch in range(num_epoch):
    # start training
    for sample in train_dataloader:
        model.train()
        optimizer.zero_grad()

        pred = model(sample['token'].long().to(device))

        loss = criteria(pred, sample['id'].long().to(device))

        loss.backward()
        optimizer.step()

        avg_loss += loss.item() / len(train_dataloader)

    # start validation
    correct_cnt = 0.0
    data_cnt = 0.0
    for sample in valid_dataloader:
        model.eval()

        with torch.no_grad():
            pred = model(sample['token'].long().to(device))

        pred_user_id = torch.argmax(pred, dim=-1)

        accu = pred_user_id.detach().cpu() == sample['id']

        correct_cnt += torch.sum(accu)
        data_cnt += sample['token'].shape[0]

    # calculate best valid accuracy, and save the best model. 
    curr_valid_accu = (correct_cnt / data_cnt).item()
    print('[EPOCH {}] VALID ACCURACY: {}'.format(epoch, curr_valid_accu))
    x.append(epoch)
    y.append(curr_valid_accu)

    best_valid_accu = max(best_valid_accu, curr_valid_accu)
    if best_valid_accu == curr_valid_accu:
        best_epoch = epoch
        best_model = copy.deepcopy(model)
        torch.save(best_model.state_dict(), 'GRU_best_baseline.pth')
        print('[EPOCH {}] BEST VALID ACCURACY UPDATED: {}'.format(epoch, best_valid_accu))

print('FINISHED TRAINING : BEST MODEL AT EPOCH {} WITH ACCURACY {}'.format(best_epoch, best_valid_accu))

In [51]:
df_lume['gl_posting_id'].value_counts(dropna=False)

40.0    61677
NaN     58383
50.0     5581
70.0        5
91.0        4
80.0        1
Name: gl_posting_id, dtype: int64

In [52]:
df_lume.dropna(subset=['gl_posting_id'], inplace=True)
df_lume["gl_posting_id"] = df_lume["gl_posting_id"].apply(lambda x: int(x))

In [53]:
def get_certain_class_after_vec_gl_posting(df_lume, label):
    label = label
    df_attr = get_reduced_df(df_lume, label, SCAN_ID_COL, min_num_samples=6)
    x_train, x_test, y_train, y_test = split_for_target_col_stratified(df_attr, label)
    vectorizer = TfidfVectorizer(max_features=20000, max_df=0.75, sublinear_tf=True,)
    X_train = x_train['text']
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test = x_test['text']
    X_test_vec = vectorizer.transform(X_test)
    return x_train, x_test, y_train, y_test, X_train_vec, X_test_vec

In [54]:
label = 'gl_posting_id'
x_train, x_test, y_train, y_test, X_train_vec, X_test_vec = get_certain_class_after_vec_gl_posting(df_lume, label)

Reduced to 62601 samples from 2 relevant classes. (N=6)
