# This notebook will give a first baseline estimation for the matching of entities via a random forest algorithm as multi-class classification

In [4]:
import os
import pandas as pd
import plotly.express as px
import progressbar
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [1]:
def remove_stopwords(token_vector, stopwords_list):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in stopwords_list])

In [6]:
def remove_punctuation(token_vector):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in string.punctuation])

In [7]:
#get path informationb
product_path = '../../../../src/data/product'
train_test_all_filtered_path = os.path.join(product_path, 'train_test_split/output_unfiltered_tables/large/after_manual_checking')


In [32]:
#fet whole data for baseline
df_train= pd.read_json(os.path.join(train_test_all_filtered_path,'train/concatenated_data/train_all_filtered_tables.json.gz'), compression='gzip', lines=True)
df_test= pd.read_json(os.path.join(train_test_all_filtered_path,'test/concatenated_data/test_all_filtered_tables.json.gz'), compression='gzip', lines=True)
df_val= pd.read_json(os.path.join(train_test_all_filtered_path,'val/concatenated_data/val_all_filtered_tables.json.gz'), compression='gzip', lines=True)

In [33]:
ids = set(df_val.cluster_id) & set(df_test.cluster_id) & set(df_train.cluster_id)

In [34]:
df_val = df_val[df_val['cluster_id'].isin(ids)]
df_test = df_test[df_test['cluster_id'].isin(ids)]
df_train = df_train[df_train['cluster_id'].isin(ids)]

In [35]:
df_whole = pd.concat([df_train, df_test,df_val]).reset_index()

### Combine tf-idf and tf vector based features

In [36]:
#filter down the dataframe
df_whole = df_whole[['name','description','cluster_id','table_id']]

In [37]:
# fill emtpy values for description and the concat name and description
df_whole.description.fillna(value=',', inplace=True)
df_whole['concat_information']=df_whole['name'].astype(str)#+df_whole['description'].astype(str)
df_whole.drop(columns=['name','description'],inplace= True)

In [38]:
#clean concated description column to use tf-idf 
df_whole['concat_information'] = df_whole['concat_information'].apply(lambda row: row.lower())
df_whole['tokens'] = df_whole['concat_information'].apply(lambda row: word_tokenize(row))
df_whole['tokens'] = remove_stopwords(df_whole['tokens'],stopwords.words())
df_whole['tokens'] = remove_punctuation (df_whole['tokens'])
df_whole.drop(columns=['concat_information'],inplace=True)

In [39]:
#define vectorizer to match preprocessed tokes for term frequency
def dummy(doc):
    return doc

vectorizer  = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        max_features=15000)  
tf_value = vectorizer.fit_transform(df_whole['tokens'])

In [40]:
#define vectorizer to match preprocessed tokes
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    max_features=15000)  
tfidf_value = tfidf.fit_transform(df_whole['tokens'])

In [41]:
df_tf = pd.DataFrame(tf_value.toarray(), columns=vectorizer.get_feature_names())
df_tfidf = pd.DataFrame(tfidf_value.toarray(), columns=tfidf.get_feature_names())
df_prepared = pd.concat([df_whole, df_tfidf, df_tf], axis=1)

In [42]:
train = df_prepared[~df_prepared['table_id'].isin(df_test['table_id'].drop_duplicates().to_list())].reset_index()
df_target_train = train['cluster_id']
train.drop(columns=['cluster_id','table_id','tokens'],inplace= True)

In [43]:
test = df_prepared[df_prepared['table_id'].isin(df_test['table_id'].drop_duplicates().to_list())].reset_index()
df_target_test = test['cluster_id']
test.drop(columns=['cluster_id','table_id','tokens'],inplace= True)

In [44]:
# Baseline random forest
rf = RandomForestClassifier()
rf.fit(train,df_target_train)
prediction = rf.predict(test) 
f1_mic = f1_score(df_target_test,prediction,average='micro')
f1_mac = f1_score(df_target_test,prediction,average='macro')
accuracy = accuracy_score(df_target_test,prediction) 
precision = precision_score(df_target_test,prediction,average='micro') 
recall = recall_score(df_target_test,prediction,average='micro') 
precision_mac = precision_score(df_target_test,prediction,average='macro') 
recall_mac = recall_score(df_target_test,prediction,average='macro') 
print("The F1-Score micro on test set: {:.4f}".format(f1_mic))
print("The F1-Score macro on test set: {:.4f}".format(f1_mac))
print("The Precision on test set: {:.4f}".format(precision))
print("The Recall on test set: {:.4f}".format(recall))
print("The Precision macro on test set: {:.4f}".format(precision_mac))
print("The Recall macro on test set: {:.4f}".format(recall_mac))
print("The Accuracy-Score on test set: {:.4f}".format(accuracy))

The F1-Score micro on test set: 0.8684
The F1-Score macro on test set: 0.7826
The Precision on test set: 0.8684
The Recall on test set: 0.8684
The Precision macro on test set: 0.8109
The Recall macro on test set: 0.7973
The Accuracy-Score on test set: 0.8684


  _warn_prf(average, modifier, msg_start, len(result))
