# This notebook will give a first baseline estimation for the matching of entities via a random forest algorithm as multi-class classification

In [1]:
import os
import pandas as pd
import gzip
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
trainPath = r'../../../src/data/LocalBusiness/Splitting_12.20/Train_Test/train tables' + '/'
testPath = r'../../../src/data/LocalBusiness/Splitting_12.20/Train_Test/test tables' + '/'
trainTables = os.listdir(trainPath)
testTables = os.listdir(testPath)
LBData = []

for table in trainTables:
    if table != '.ipynb_checkpoints':
        with gzip.open(trainPath + table, 'r') as dataFile:
            for line in dataFile:
                lineData = json.loads(line.decode('utf-8'))
                lineData['origin'] = table
                LBData.append(lineData)
trainData = pd.DataFrame(LBData)

LBData = []
for table in testTables:
    if table != '.ipynb_checkpoints':
        with gzip.open(testPath + table, 'r') as dataFile:
            for line in dataFile:
                lineData = json.loads(line.decode('utf-8'))
                lineData['origin'] = table
                LBData.append(lineData)
testData = pd.DataFrame(LBData)   

In [3]:
columns = ['name']
trainData['concat'] = trainData[columns].astype(str).agg(' '.join, axis=1)
testData['concat'] = testData[columns].astype(str).agg(' '.join, axis=1)
trainData = trainData[['concat', 'cluster_id', 'origin']]
testData = testData[['concat', 'cluster_id', 'origin']]
trainData = trainData.loc[trainData['cluster_id'] > -1]
testData = testData.loc[testData['cluster_id'] > -1]

In [4]:
frames = [trainData, testData]
allData = pd.concat(frames)
allData['cluster_id_mapped'] = allData.groupby('cluster_id').ngroup()

trainData = allData.loc[allData['origin'].isin(trainData['origin'])]
testData = allData.loc[~allData['origin'].isin(trainData['origin'])]

In [5]:
trainData.to_csv(r'../../../src/data/LocalBusiness/Splitting_12.20/Train_Test/train.csv')
testData.to_csv(r'../../../src/data/LocalBusiness/Splitting_12.20/Train_Test/test.csv')

In [6]:
def remove_stopwords(token_vector, stopwords_list):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in stopwords_list])

In [7]:
def remove_punctuation(token_vector):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in string.punctuation])

In [7]:
#clusters = data.groupby(['telephoneNorm']).size().reset_index(name='counts').sort_values('counts', ascending=False)
#clusters = clusters.loc[clusters['counts'] > 1]
#clusteredData = data[data['telephoneNorm'].isin(clusters['telephoneNorm'])]
#clusteredData['ClusterID'] = clusteredData.groupby('telephoneNorm').ngroup()
#columns = ['name', 'addressregion', 'streetaddress', 'addresslocality', 'addresscountry', 'longitude', 'latitude']
#clusteredData['concat'] = clusteredData[columns].astype(str).agg(' '.join, axis=1)

In [8]:
allData = allData.rename(columns={'origin': 'originalSource'})

In [9]:
clusteredData = allData.sample(3000)
clusteredData

Unnamed: 0,concat,cluster_id,originalSource,cluster_id_mapped
10354,The Westin Princeville Ocean Resort Villas,3905,Hotel_kayak.com_September2020.json.gz,3904
57713,松井建築研究所,451,LocalBusiness_homify.co.th_September2020.json.gz,450
763714,Africa Travel,5022,LocalBusiness_ait-themes.club_September2020.js...,5021
251074,Public Restaurant,3366,Restaurant_chope.co_September2020.json.gz,3365
653467,Reiter Roofing Inc,4010,LocalBusiness_libertyroofingkc.com_September20...,4009
...,...,...,...,...
231207,American Shapewear,6050,LocalBusiness_americanshapewear.com_September2...,6049
93617,GreyHaze Wholesale,6050,LocalBusiness_greyhazewholesale.com_September2...,6049
778051,Badass Jewelry,6050,LocalBusiness_badassjewelry.com_September2020....,6049
456923,Press:Come cambiano le minacce all'hardware,2920,LocalBusiness_investigazioni-informatiche.it_S...,2919


### Combine tf-idf and tf vector based features

In [10]:
#clean concated description column to use tf-idf 
clusteredData['concat'] = clusteredData['concat'].apply(lambda row: row.lower())
clusteredData['tokens'] = clusteredData['concat'].apply(lambda row: word_tokenize(row))
clusteredData['tokens'] = remove_stopwords(clusteredData['tokens'],stopwords.words())
clusteredData['tokens'] = remove_punctuation (clusteredData['tokens'])
clusteredData.drop(columns=['concat'],inplace=True)
clusteredData = clusteredData[['tokens','cluster_id_mapped', 'originalSource']]

In [11]:
clusteredData

Unnamed: 0,tokens,cluster_id_mapped,originalSource
10354,"[westin, princeville, ocean, resort, villas]",3904,Hotel_kayak.com_September2020.json.gz
57713,[松井建築研究所],450,LocalBusiness_homify.co.th_September2020.json.gz
763714,"[africa, travel]",5021,LocalBusiness_ait-themes.club_September2020.js...
251074,"[public, restaurant]",3365,Restaurant_chope.co_September2020.json.gz
653467,"[reiter, roofing, inc]",4009,LocalBusiness_libertyroofingkc.com_September20...
...,...,...,...
231207,"[american, shapewear]",6049,LocalBusiness_americanshapewear.com_September2...
93617,"[greyhaze, wholesale]",6049,LocalBusiness_greyhazewholesale.com_September2...
778051,"[badass, jewelry]",6049,LocalBusiness_badassjewelry.com_September2020....
456923,"[press, cambiano, minacce, all'hardware]",2919,LocalBusiness_investigazioni-informatiche.it_S...


In [12]:
#define vectorizer to match preprocessed tokes for term frequency
def dummy(doc):
    return doc

vectorizer  = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        max_features=5000)  
tf_value = vectorizer.fit_transform(clusteredData['tokens'])

In [13]:
#define vectorizer to match preprocessed tokes
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    max_features=5000)  
tfidf_value = tfidf.fit_transform(clusteredData['tokens'])

In [17]:
df_tf = pd.DataFrame(tf_value.toarray(), columns=vectorizer.get_feature_names())
df_tfidf = pd.DataFrame(tfidf_value.toarray(), columns=tfidf.get_feature_names())
df_prepared = pd.concat([clusteredData.reset_index(), df_tfidf], axis=1)
df_prepared

Unnamed: 0,index,tokens,cluster_id_mapped,originalSource,'','archiworkshop,'bond,'n,'office,'s,...,창조하우징,쿠알라룸푸르,퍼스트애비뉴,하노이,호텔,홈스타일토토,홍예디자인,＆,（株）ハウスインフォ,ｉｓｄアーキテクト／一級建築士事務所
0,10354,"[westin, princeville, ocean, resort, villas]",3904,Hotel_kayak.com_September2020.json.gz,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,57713,[松井建築研究所],450,LocalBusiness_homify.co.th_September2020.json.gz,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,763714,"[africa, travel]",5021,LocalBusiness_ait-themes.club_September2020.js...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,251074,"[public, restaurant]",3365,Restaurant_chope.co_September2020.json.gz,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,653467,"[reiter, roofing, inc]",4009,LocalBusiness_libertyroofingkc.com_September20...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,231207,"[american, shapewear]",6049,LocalBusiness_americanshapewear.com_September2...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2996,93617,"[greyhaze, wholesale]",6049,LocalBusiness_greyhazewholesale.com_September2...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2997,778051,"[badass, jewelry]",6049,LocalBusiness_badassjewelry.com_September2020....,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2998,456923,"[press, cambiano, minacce, all'hardware]",2919,LocalBusiness_investigazioni-informatiche.it_S...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
y = df_prepared[['cluster_id_mapped', 'originalSource']]
df_prepared.drop(columns=['tokens','cluster_id_mapped'], inplace=True)

In [85]:
y

Unnamed: 0,cluster_id_mapped,originalSource
0,5073,LocalBusiness_ranking-selfstorage.pl_September...
1,6425,Hotel_adlibooking.ir_September2020.json.gz
2,6425,Hotel_iranhotel724.ir_September2020.json.gz
3,3321,Hotel_kayak.com.my_September2020.json.gz
4,4742,LocalBusiness_webdiamonds.us_September2020.jso...
...,...,...
49995,3360,Restaurant_viamichelin.at_September2020.json.gz
49996,1833,LocalBusiness_homify.co.uk_September2020.json.gz
49997,547,LocalBusiness_homify.pt_September2020.json.gz
49998,423,LocalBusiness_homify.pe_September2020.json.gz


In [86]:
y_train = y.loc[y['originalSource'].isin(trainData['origin'])]
y_test = y.loc[~y['originalSource'].isin(trainData['origin'])]
y_train = y_train['cluster_id_mapped']
y_test = y_test['cluster_id_mapped']

x_train = df_prepared.loc[df_prepared['originalSource'].isin(trainData['origin'])]
x_test = df_prepared.loc[~df_prepared['originalSource'].isin(trainData['origin'])]
x_train= x_train.drop(columns=['index', 'originalSource'])
x_test= x_test.drop(columns=['index', 'originalSource'])

In [1]:
y_train

NameError: name 'y_train' is not defined

In [88]:
# Baseline random forest
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
prediction = rf.predict(x_test) 
f1_mic = f1_score(y_test,prediction,average='micro')
f1_mac = f1_score(y_test,prediction,average='macro')
accuracy = accuracy_score(y_test,prediction) 
precision = precision_score(y_test,prediction,average='micro') 
recall = recall_score(y_test,prediction,average='micro') 
precision_mac = precision_score(y_test,prediction,average='macro') 
recall_mac = recall_score(y_test,prediction,average='macro') 
print("The F1-Score micro on test set: {:.4f}".format(f1_mic))
print("The F1-Score macro on test set: {:.4f}".format(f1_mac))
print("The Precision on test set: {:.4f}".format(precision))
print("The Recall on test set: {:.4f}".format(recall))
print("The Precision macro on test set: {:.4f}".format(precision_mac))
print("The Recall macro on test set: {:.4f}".format(recall_mac))
print("The Accuracy-Score on test set: {:.4f}".format(accuracy))