# This notebook will give a first baseline estimation for the matching of entities via a random forest algorithm as multi-class classification

In [1]:
import os
import pandas as pd
import plotly.express as px
import progressbar
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
def remove_stopwords(token_vector, stopwords_list):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in stopwords_list])

In [3]:
def remove_punctuation(token_vector):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in string.punctuation])

In [8]:
#get path informationb
product_path = '../../../../src/data/product'
train_test_all_filtered_path = os.path.join(product_path, 'train_test_split/output_unfiltered_tables/large/after_manual_checking')


In [9]:
#fet whole data for baseline
all_filtered_tables_df = pd.read_json(os.path.join(train_test_all_filtered_path,'train/concatenated_data/train_all_filtered_tables.json.gz'), compression='gzip', lines=True)

In [10]:
all_filtered_tables_df

Unnamed: 0,row_id,name,offers,description,sku,page_url,cluster_id,table_id,brand_x,brand_y,...,position,product-sku,product-name,ratingcount,breadcrumb,inlanguage,ispartof,speakable,availability,pricevaliduntil
0,0,mason pearson brush.,"{'pricecurrency': 'USD', 'price': '175.00', 'i...",A handcrafted hairbrush from Mason Pearson wit...,210000000083,https://www.malinandgoetz.com/mason-pearson-brush,1617106,Product_malinandgoetz.com_September2020.json.gz,,,...,,,,,,,,,,
1,35,rum body lotion.,"{'price': '38.00', 'availability': 'http://sch...",a lightweight rum-scented body lotion that abs...,210000000448,https://eu.malinandgoetz.com/rum-body-lotion-8...,1257145,Product_malinandgoetz.com_September2020.json.gz,,,...,,,,,,,,,,
2,323,Cisco Catalyst WS-C2960X-24TD-L Stackable Rack...,"{'price': '0.00', 'pricecurrency': 'GBP', 'ite...",,WS-C2960X-24TD-L,https://www.odsi.co.uk/ws-c2960x-24td-l.php,758825,Product_odsi.co.uk_September2020.json.gz,Cisco,cisco,...,,,,,,,,,,
3,25,Laurel Heights Charcoal Crest LH99,"{'pricecurrency': 'USD', 'price': '0', 'availa...",Marchand's Interior & Hardware in Gonzales has...,,https://www.marchands.net/american-olean-laure...,18329550,Product_marchands.net_September2020.json.gz,American Olean,american olean,...,,,,,,,,,,
4,58,Laurel Heights Gray Summit LH98,"{'pricecurrency': 'USD', 'price': '0', 'availa...",Marchand's Interior & Hardware in Gonzales has...,,https://www.marchands.net/american-olean-laure...,52605388,Product_marchands.net_September2020.json.gz,American Olean,american olean,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11605,724,Anthelios Xl Spf50 Gel-crema Toque Seco Color ...,"{'offercount': '20', 'pricecurrency': 'EUR', '...",Anthelios xl spf50 gelcrema toque seco color 5...,1047.0,https://www.boticas23.com/anthelios-xl-spf50-g...,1937007,Product_boticas23.com_September2020.json.gz,La Roche-Posay,,...,,,,,,,,,,
11606,755,Vichy neovadiol crema piel seca 50ml,"{'offercount': '3', 'price': '23.99', 'availab...",Vichy neovadiol crema piel seca 50 ml NEOVADIO...,2931.0,https://www.boticas23.com/vichy-neovadiol-crem...,33370,Product_boticas23.com_September2020.json.gz,Vichy,,...,,,,,,,,,,
11607,779,Anthelios Xl Leche Spf50+ 250ml,"{'pricecurrency': 'EUR', 'price': '16.99', 'hi...",ANTHELIOS XL SPF 50 LECHE ATERCIOPELADA 250ml ...,1032.0,https://www.boticas23.com/anthelios-xl-leche-s...,708960,Product_boticas23.com_September2020.json.gz,La Roche-Posay,,...,,,,,,,,,,
11608,827,Autohelios gel autobronceador 100ml,"{'highprice': '11.95', 'offercount': '2', 'pri...",AUTOHELIOS GELLECHE Autobronceador hidratante ...,1039.0,https://www.boticas23.com/autohelios-gel-autob...,62047,Product_boticas23.com_September2020.json.gz,La Roche-Posay,,...,,,,,,,,,,


In [11]:
# get information about train and test table split
product_path = '../../../../src/data/product'
zip_files_train = [file for file in os.listdir(os.path.join(train_test_all_filtered_path, 'train')) if file.endswith('.json.gz')]
zip_files_val = [file for file in os.listdir(os.path.join(train_test_all_filtered_path, 'val')) if file.endswith('.json.gz')]
zip_files_test = [file for file in os.listdir(os.path.join(train_test_all_filtered_path, 'test')) if file.endswith('.json.gz')]

In [7]:
all_filtered_tables_df

Unnamed: 0,row_id,name,offers,description,sku,brand,page_url,cluster_id,table_id,brand_x,...,video,position,ratingcount,target,breadcrumb,ispartof,speakable,title,isconsumablefor,productbrand
0,399,WD 2TB Elements Portable External Hard Drive -...,"{'pricecurrency': 'USD', 'price': '74.99', 'sk...",Brand: Western Digital Color: black Features: ...,B06W55K9N6,{'name': 'Western Digital'},https://area399.com/products/wd-2tb-elements-p...,541658,Product_area399.com_September2020.json.gz,,...,,,,,,,,,,
1,18,Three Gifts Goat Milk Soap,"{'price': '8.5', 'availability': 'https://sche...","Gold, frankincense and myrrh - three gifts giv...",,,https://www.thecraftist.com/product-page/three...,65383188,Product_thecraftist.com_September2020.json.gz,,...,,,,,,,,,,
2,601,GoPro Head Strap + Quick Clip,"{'pricecurrency': 'CAD', 'availability': 'http...",,,GoPro,https://www.specialisteduski.com/gopro-head-st...,863679,Product_specialisteduski.com_September2020.jso...,GoPro,...,,,,,,,,,,
3,722,GoPro 3 way grip Arm GoPro,"{'price': '102.99', 'pricecurrency': 'CAD', 'a...",,,GoPro,https://www.specialisteduski.com/gopro-3-way-g...,251884,Product_specialisteduski.com_September2020.jso...,GoPro,...,,,,,,,,,,
4,0,mason pearson brush.,"{'pricecurrency': 'USD', 'price': '175.00', 'i...",A handcrafted hairbrush from Mason Pearson wit...,210000000083,,https://www.malinandgoetz.com/mason-pearson-brush,1617106,Product_malinandgoetz.com_September2020.json.gz,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31287,834,Gibson Les Paul Joe Bonamassa Goldtop,"{'price': '2599.00', 'pricevaliduntil': '2021-...",,210000000448.0,,https://sixtysixtysounds.com/product/gibson-le...,1257145,Product_sixtysixtysounds.com_September2020.jso...,,...,,,,,,,,,,
31288,854,Gibson Birdseye Les Paul 1959 Cherry,"{'price': '0.00', 'pricevaliduntil': '2021-12-...",,210000000722.0,,https://sixtysixtysounds.com/product/gibson-bi...,1524820,Product_sixtysixtysounds.com_September2020.jso...,,...,,,,,,,,,,
31289,442,Epiphone Masterbilt AJ 500 RCE,"{'pricevaliduntil': '2021-12-31', 'price': '44...",,210000000077.0,,https://sixtysixtysounds.com/product/epiphone-...,1702641,Product_sixtysixtysounds.com_September2020.jso...,,...,,,,,,,,,,
31290,1074,Essência White Beer 500g,"{'price': 'R$ 92,00'}",,,,https://www.lojapeterpaiva.com.br/produto/esse...,2434654,Product_lojapeterpaiva.com.br_September2020.js...,,...,,,,,,,,,,Peter Paiva


In [8]:
all_filtered_tables_df['dataset']=''

In [9]:
all_filtered_tables_df.loc[all_filtered_tables_df['table_id'].isin(zip_files_train),'dataset']='train'

In [10]:
all_filtered_tables_df.loc[all_filtered_tables_df['table_id'].isin(zip_files_val),'dataset']='val'

In [11]:
all_filtered_tables_df.loc[all_filtered_tables_df['table_id'].isin(zip_files_test),'dataset']='test'

In [12]:
all_filtered_tables_df

Unnamed: 0,row_id,name,offers,description,sku,brand,page_url,cluster_id,table_id,brand_x,...,position,ratingcount,target,breadcrumb,ispartof,speakable,title,isconsumablefor,productbrand,dataset
0,399,WD 2TB Elements Portable External Hard Drive -...,"{'pricecurrency': 'USD', 'price': '74.99', 'sk...",Brand: Western Digital Color: black Features: ...,B06W55K9N6,{'name': 'Western Digital'},https://area399.com/products/wd-2tb-elements-p...,541658,Product_area399.com_September2020.json.gz,,...,,,,,,,,,,test
1,18,Three Gifts Goat Milk Soap,"{'price': '8.5', 'availability': 'https://sche...","Gold, frankincense and myrrh - three gifts giv...",,,https://www.thecraftist.com/product-page/three...,65383188,Product_thecraftist.com_September2020.json.gz,,...,,,,,,,,,,test
2,601,GoPro Head Strap + Quick Clip,"{'pricecurrency': 'CAD', 'availability': 'http...",,,GoPro,https://www.specialisteduski.com/gopro-head-st...,863679,Product_specialisteduski.com_September2020.jso...,GoPro,...,,,,,,,,,,test
3,722,GoPro 3 way grip Arm GoPro,"{'price': '102.99', 'pricecurrency': 'CAD', 'a...",,,GoPro,https://www.specialisteduski.com/gopro-3-way-g...,251884,Product_specialisteduski.com_September2020.jso...,GoPro,...,,,,,,,,,,test
4,0,mason pearson brush.,"{'pricecurrency': 'USD', 'price': '175.00', 'i...",A handcrafted hairbrush from Mason Pearson wit...,210000000083,,https://www.malinandgoetz.com/mason-pearson-brush,1617106,Product_malinandgoetz.com_September2020.json.gz,,...,,,,,,,,,,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31287,834,Gibson Les Paul Joe Bonamassa Goldtop,"{'price': '2599.00', 'pricevaliduntil': '2021-...",,210000000448.0,,https://sixtysixtysounds.com/product/gibson-le...,1257145,Product_sixtysixtysounds.com_September2020.jso...,,...,,,,,,,,,,test
31288,854,Gibson Birdseye Les Paul 1959 Cherry,"{'price': '0.00', 'pricevaliduntil': '2021-12-...",,210000000722.0,,https://sixtysixtysounds.com/product/gibson-bi...,1524820,Product_sixtysixtysounds.com_September2020.jso...,,...,,,,,,,,,,test
31289,442,Epiphone Masterbilt AJ 500 RCE,"{'pricevaliduntil': '2021-12-31', 'price': '44...",,210000000077.0,,https://sixtysixtysounds.com/product/epiphone-...,1702641,Product_sixtysixtysounds.com_September2020.jso...,,...,,,,,,,,,,test
31290,1074,Essência White Beer 500g,"{'price': 'R$ 92,00'}",,,,https://www.lojapeterpaiva.com.br/produto/esse...,2434654,Product_lojapeterpaiva.com.br_September2020.js...,,...,,,,,,,,,Peter Paiva,val


In [13]:
#filter down the dataframe
df_whole = all_filtered_tables_df[['row_id','name','description','cluster_id','table_id','dataset']]

In [14]:
df_whole.sort_values(by=['cluster_id','dataset'])

Unnamed: 0,row_id,name,description,cluster_id,table_id,dataset
189,236,Begg x Co Kishorn Scarf: Flannel Grey,Woven in Begg x Co's historic Ayrshire mill on...,985,Product_trunkclothiers.com_September2020.json.gz,test
1237,731,Canon EOS REBEL T7i Body,EOS Rebel T7i The EOS Rebel T7i camera has pro...,985,Product_discountshop.com_September2020.json.gz,test
6360,592,Canon EOS Rebel T7i,<p>Packing a versatile feature-set in a sleek ...,985,Product_prophotosupply.com_September2020.json.gz,test
7708,142,Cakebread Reserve Chardonnay 2017,"Seductive, slightly floral aromas of creamy go...",985,Product_ocwinemart.com_September2020.json.gz,test
5671,342,Canon EOS Rebel T7i DSLR Camera Body,Packing a versatile feature-set in a sleek and...,985,Product_cameramall.com_September2020.json.gz,train
...,...,...,...,...,...,...
13379,14,Xiaomi Redmi Note 7 Pro 6.3 inch 6GB RAM 128GB...,Xiaomi Redmi Note 7 Pro CPU: Snapdragon 675 Oc...,80168995,Product_store-singapore.com_September2020.json.gz,train
15654,6,Xiaomi Redmi Note 7 Pro 6.3 inch 6GB RAM 128GB...,Xiaomi Redmi Note 7 Pro CPU: Snapdragon 675 Oc...,80168995,Product_store-belgie.com_September2020.json.gz,train
20242,25,Xiaomi Redmi Note 7 Pro 6.3 inch 6GB RAM 128GB...,Xiaomi Redmi Note 7 Pro CPU: Snapdragon 675 Oc...,80168995,Product_storegermany.com_September2020.json.gz,train
7488,1111,Xiaomi Redmi Note 7 Pro 6.3 inch 6GB RAM 128GB...,Xiaomi Redmi Note 7 Pro CPU: Snapdragon 675 Oc...,80168995,Product_iwebshop.com_September2020.json.gz,val


In [15]:
df_whole.sort_values(by=['cluster_id','dataset']).to_excel('Test_Data_Check_1.xlsx')

### Only use idf-tf vector based features

In [8]:
# fill emtpy values for description and the concat name and description
df_whole.description.fillna(value=',', inplace=True)
df_whole['concat_information']=df_whole['name'].astype(str)+df_whole['description'].astype(str)
df_whole.drop(columns=['name','description'],inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['concat_information']=df_whole['name'].astype(str)+df_whole['description'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
#clean concated description column to use tf-idf 
df_whole['concat_information'] = df_whole['concat_information'].apply(lambda row: row.lower())
df_whole['tokens'] = df_whole['concat_information'].apply(lambda row: word_tokenize(row))
df_whole['tokens'] = remove_stopwords(df_whole['tokens'],stopwords.words())
df_whole['tokens'] = remove_punctuation (df_whole['tokens'])
df_whole.drop(columns=['concat_information'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['concat_information'] = df_whole['concat_information'].apply(lambda row: row.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['tokens'] = df_whole['concat_information'].apply(lambda row: word_tokenize(row))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['tokens'] =

In [10]:
#define vectorizer to match preprocessed tokes
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None, 
    max_features=15000)  
tfidf_value = tfidf.fit_transform(df_whole['tokens'])

In [11]:
df_tfidf = pd.DataFrame(tfidf_value.toarray(), columns=tfidf.get_feature_names())
df_prepared = pd.concat([df_whole, df_tfidf], axis=1)

In [12]:
df_train = df_prepared[df_prepared['table_id'].isin(zip_files_train)].reset_index()
df_target_train = df_train['cluster_id']
df_train.drop(columns=['cluster_id','table_id','tokens'],inplace= True)

In [13]:
df_test = df_prepared[df_prepared['table_id'].isin(zip_files_test)].reset_index()
df_target_test = df_test['cluster_id']
df_test.drop(columns=['cluster_id','table_id','tokens'],inplace= True)

In [14]:
# Baseline random forest
rf = RandomForestClassifier()
rf.fit(df_train,df_target_train)
prediction = rf.predict(df_test) 
f1_mic = f1_score(df_target_test,prediction,average='micro')
f1_mac = f1_score(df_target_test,prediction,average='macro')
accuracy = accuracy_score(df_target_test,prediction) 
precision = precision_score(df_target_test,prediction,average='micro') 
recall = recall_score(df_target_test,prediction,average='micro') 
precision_mac = precision_score(df_target_test,prediction,average='macro') 
recall_mac = recall_score(df_target_test,prediction,average='macro') 
print("The F1-Score micro on test set: {:.4f}".format(f1_mic))
print("The F1-Score on test set: {:.4f}".format(f1_mac))
print("The Precision on test set: {:.4f}".format(precision))
print("The Recall on test set: {:.4f}".format(recall))
print("The Precision macro on test set: {:.4f}".format(precision_mac))
print("The Recall macro on test set: {:.4f}".format(recall_mac))
print("The Accuracy-Score on test set: {:.4f}".format(accuracy))

The F1-Score micro on test set: 0.7717
The F1-Score on test set: 0.6596
The Precision on test set: 0.7717
The Recall on test set: 0.7717
The Precision macro on test set: 0.7026
The Recall macro on test set: 0.6725
The Accuracy-Score on test set: 0.7717


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Only use tf vector based features

In [15]:
#filter down the dataframe
df_whole = all_filtered_tables_df[['name','description','cluster_id','table_id']]

In [16]:
# fill emtpy values for description and the concat name and description
df_whole.description.fillna(value=',', inplace=True)
df_whole['concat_information']=df_whole['name'].astype(str)#+df_whole['description'].astype(str)
df_whole.drop(columns=['name','description'],inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['concat_information']=df_whole['name'].astype(str)#+df_whole['description'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [17]:
#clean concated description column to use tf-idf 
df_whole['concat_information'] = df_whole['concat_information'].apply(lambda row: row.lower())
df_whole['tokens'] = df_whole['concat_information'].apply(lambda row: word_tokenize(row))
df_whole['tokens'] = remove_stopwords(df_whole['tokens'],stopwords.words())
df_whole['tokens'] = remove_punctuation (df_whole['tokens'])
df_whole.drop(columns=['concat_information'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['concat_information'] = df_whole['concat_information'].apply(lambda row: row.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['tokens'] = df_whole['concat_information'].apply(lambda row: word_tokenize(row))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['tokens'] =

In [18]:
#define vectorizer to match preprocessed tokes for term frequency
def dummy(doc):
    return doc

vectorizer  = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        max_features=15000)  
tf_value = vectorizer.fit_transform(df_whole['tokens'])

In [19]:
df_tf = pd.DataFrame(tf_value.toarray(), columns=vectorizer.get_feature_names())
df_prepared = pd.concat([df_whole, df_tf], axis=1)

In [20]:
df_train = df_prepared[df_prepared['table_id'].isin(zip_files_train)].reset_index()
df_target_train = df_train['cluster_id']
df_train.drop(columns=['cluster_id','table_id','tokens'],inplace= True)

In [21]:
df_test = df_prepared[df_prepared['table_id'].isin(zip_files_test)].reset_index()
df_target_test = df_test['cluster_id']
df_test.drop(columns=['cluster_id','table_id','tokens'],inplace= True)

In [22]:
# Baseline random forest
rf = RandomForestClassifier()
rf.fit(df_train,df_target_train)
prediction = rf.predict(df_test) 
f1_mic = f1_score(df_target_test,prediction,average='micro')
f1_mac = f1_score(df_target_test,prediction,average='macro')
accuracy = accuracy_score(df_target_test,prediction) 
precision = precision_score(df_target_test,prediction,average='micro') 
recall = recall_score(df_target_test,prediction,average='micro') 
precision_mac = precision_score(df_target_test,prediction,average='macro') 
recall_mac = recall_score(df_target_test,prediction,average='macro') 
print("The F1-Score micro on test set: {:.4f}".format(f1_mic))
print("The F1-Score macro on test set: {:.4f}".format(f1_mac))
print("The Precision on test set: {:.4f}".format(precision))
print("The Recall on test set: {:.4f}".format(recall))
print("The Precision macro on test set: {:.4f}".format(precision_mac))
print("The Recall macro on test set: {:.4f}".format(recall_mac))
print("The Accuracy-Score on test set: {:.4f}".format(accuracy))

The F1-Score micro on test set: 0.8122
The F1-Score macro on test set: 0.7295
The Precision on test set: 0.8122
The Recall on test set: 0.8122
The Precision macro on test set: 0.7751
The Recall macro on test set: 0.7365
The Accuracy-Score on test set: 0.8122


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Combine tf-idf and tf vector based features

In [23]:
#filter down the dataframe
df_whole = all_filtered_tables_df[['name','description','cluster_id','table_id']]

In [24]:
# fill emtpy values for description and the concat name and description
df_whole.description.fillna(value=',', inplace=True)
df_whole['concat_information']=df_whole['name'].astype(str)#+df_whole['description'].astype(str)
df_whole.drop(columns=['name','description'],inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['concat_information']=df_whole['name'].astype(str)#+df_whole['description'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [25]:
#clean concated description column to use tf-idf 
df_whole['concat_information'] = df_whole['concat_information'].apply(lambda row: row.lower())
df_whole['tokens'] = df_whole['concat_information'].apply(lambda row: word_tokenize(row))
df_whole['tokens'] = remove_stopwords(df_whole['tokens'],stopwords.words())
df_whole['tokens'] = remove_punctuation (df_whole['tokens'])
df_whole.drop(columns=['concat_information'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['concat_information'] = df_whole['concat_information'].apply(lambda row: row.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['tokens'] = df_whole['concat_information'].apply(lambda row: word_tokenize(row))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_whole['tokens'] =

In [26]:
#define vectorizer to match preprocessed tokes for term frequency
def dummy(doc):
    return doc

vectorizer  = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        max_features=15000)  
tf_value = vectorizer.fit_transform(df_whole['tokens'])

In [27]:
#define vectorizer to match preprocessed tokes
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    max_features=15000)  
tfidf_value = tfidf.fit_transform(df_whole['tokens'])

In [28]:
df_tf = pd.DataFrame(tf_value.toarray(), columns=vectorizer.get_feature_names())
df_tfidf = pd.DataFrame(tfidf_value.toarray(), columns=tfidf.get_feature_names())
df_prepared = pd.concat([df_whole, df_tfidf, df_tf], axis=1)

In [29]:
df_train = df_prepared[df_prepared['table_id'].isin(zip_files_train)].reset_index()
df_target_train = df_train['cluster_id']
df_train.drop(columns=['cluster_id','table_id','tokens'],inplace= True)

In [30]:
df_test = df_prepared[df_prepared['table_id'].isin(zip_files_test)].reset_index()
df_target_test = df_test['cluster_id']
df_test.drop(columns=['cluster_id','table_id','tokens'],inplace= True)

In [31]:
# Baseline random forest
rf = RandomForestClassifier()
rf.fit(df_train,df_target_train)
prediction = rf.predict(df_test) 
f1_mic = f1_score(df_target_test,prediction,average='micro')
f1_mac = f1_score(df_target_test,prediction,average='macro')
accuracy = accuracy_score(df_target_test,prediction) 
precision = precision_score(df_target_test,prediction,average='micro') 
recall = recall_score(df_target_test,prediction,average='micro') 
precision_mac = precision_score(df_target_test,prediction,average='macro') 
recall_mac = recall_score(df_target_test,prediction,average='macro') 
print("The F1-Score micro on test set: {:.4f}".format(f1_mic))
print("The F1-Score macro on test set: {:.4f}".format(f1_mac))
print("The Precision on test set: {:.4f}".format(precision))
print("The Recall on test set: {:.4f}".format(recall))
print("The Precision macro on test set: {:.4f}".format(precision_mac))
print("The Recall macro on test set: {:.4f}".format(recall_mac))
print("The Accuracy-Score on test set: {:.4f}".format(accuracy))

The F1-Score micro on test set: 0.8163
The F1-Score macro on test set: 0.7417
The Precision on test set: 0.8163
The Recall on test set: 0.8163
The Precision macro on test set: 0.7873
The Recall macro on test set: 0.7462
The Accuracy-Score on test set: 0.8163


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
