# This notebook will give a first baseline estimation for the matching of entities via a random forest algorithm as multi-class classification

In [7]:
import os
import pandas as pd
import gzip
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [16]:
Path = r'../../Results/New_Concatenated_MatchingFile.gz'
LBData = []

with gzip.open(Path, 'r') as dataFile:
    for line in dataFile:
        lineData = json.loads(line.decode('utf-8'))
        LBData.append(lineData)
data = pd.DataFrame(LBData)

In [None]:
# get information about train and test table split
product_path = '../../../../src/data/product'
train_test_output_path = os.path.join(product_path, 'train_test_split/output_unfiltered_tables')
zip_files_train = [file for file in os.listdir(os.path.join(train_test_output_path, 'large/train')) if file.endswith('.json.gz')]
zip_files_val = [file for file in os.listdir(os.path.join(train_test_output_path, 'large/val')) if file.endswith('.json.gz')]
zip_files_test = [file for file in os.listdir(os.path.join(train_test_output_path, 'large/test')) if file.endswith('.json.gz')]

In [None]:
data_test = data[data['origin'].isin(zip_files_test)].reset_index()

In [17]:
data

Unnamed: 0,row_id,origin,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude,telephone_,phone_object,E.164 format,telephoneNorm
0,2,LocalBusiness_litmind.com_September2020.json.gz,Salvador Model Agency,"{'postalcode': '28001', 'addresslocality': 'Ma...",https://es.litmind.com/salvadormodelagency,+34 914310707,Madrid,"General Pardiñas, 34. 1º7ª",Madrid,ES,-3.6835849285126,40.423894105042,34914310707,"{'country_code': 34, 'extension': None, 'natio...",+34914310707,34914310707
1,10,LocalBusiness_litmind.com_September2020.json.gz,UNIC AZAFATAS S.L,"{'postalcode': '28250', 'addresscountry': 'ES'...",https://es.litmind.com/456459,+34 918599376,Madrid,C/ TORRENCINA 18,TORRELODONES,ES,-3.8939949,40.5808292,34918599376,"{'country_code': 34, 'extension': None, 'natio...",+34918599376,34918599376
2,16,LocalBusiness_litmind.com_September2020.json.gz,Coconut,"{'streetaddress': 'Calle Zabaleta, 10', 'addre...",https://es.litmind.com/coconutmadrid/info,+34 913776214,Madrid,"Calle Zabaleta, 10",Madrid,ES,-3.6744062,40.4413414,34913776214,"{'country_code': 34, 'extension': None, 'natio...",+34913776214,34913776214
3,18,LocalBusiness_litmind.com_September2020.json.gz,SDA MODEL'S INTERNATIONAL AGENCY,"{'postalcode': '28001', 'addresscountry': 'ES'...",https://es.litmind.com/sdamodelsintagency,+34 640847492,Madrid,ayala madrid n7 bajos,barcelona,ES,-3.6825650000001,40.427645,34640847492,"{'country_code': 34, 'extension': None, 'natio...",+34640847492,34640847492
4,20,LocalBusiness_litmind.com_September2020.json.gz,RAMONSERRANOPHOTO,"{'addressregion': 'Badajoz', 'addresslocality'...",https://es.litmind.com/ramonserranophoto-estudio,+34 636540326,Badajoz,"Avd. Constitucion, 2",Villafranca de los Barros,ES,-6.3428955,38.5616549,34636540326,"{'country_code': 34, 'extension': None, 'natio...",+34636540326,34636540326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495583,9968,Hotel_ihg.com_September2020.json.gz,,"{'postalcode': '32608', 'addresslocality': 'Ga...",https://www.ihg.com/destinations/de/de/united-...,1-352-3781300,FLORIDA,3370 S.W. 42nd Street,Gainesville,US,-8.238674639999999E1,2.962221619999999E1,13523781300,"{'country_code': 1, 'extension': None, 'nation...",+13523781300,13523781300
1495584,9975,Hotel_ihg.com_September2020.json.gz,,"{'postalcode': '30150-370', 'addresslocality':...",https://www.ihg.com/holidayinn/destinations/us...,55-31-30646555,,"Rua Professor Moraes, 600",Belo Horizonte - MG,BR,-4.3931513E1,-1.9938399E1,553130646555,"{'country_code': 55, 'extension': None, 'natio...",+553130646555,553130646555
1495585,9977,Hotel_ihg.com_September2020.json.gz,,"{'postalcode': 'A-9500', 'streetaddress': 'Eur...",https://www.ihg.com/voco/hotels/fr/fr/villach/...,43-4242-22522,,Europaplatz 1 - 2,Villach,AT,1.384929E1,4.661559E1,43424222522,"{'country_code': 43, 'extension': None, 'natio...",+43424222522,43424222522
1495586,9999,Hotel_ihg.com_September2020.json.gz,,"{'streetaddress': '12217 4th Street', 'address...",https://www.ihg.com/holidayinnexpress/destinat...,1-250-7827700,COLOMBIE BRITANNIQUE,12217 4th Street,Dawson Creek,CA,-1.20216293E2,5.5737782E1,12507827700,"{'country_code': 1, 'extension': None, 'nation...",+12507827700,12507827700


In [18]:
def remove_stopwords(token_vector, stopwords_list):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in stopwords_list])

In [19]:
def remove_punctuation(token_vector):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in string.punctuation])

In [36]:
clusters = data.groupby(['telephoneNorm']).size().reset_index(name='counts').sort_values('counts', ascending=False)
clusters = clusters.loc[clusters['counts'] > 1]
clusteredData = data[data['telephoneNorm'].isin(clusters['telephoneNorm'])]
clusteredData['ClusterID'] = clusteredData.groupby('telephoneNorm').ngroup()
columns = ['name', 'addressregion', 'streetaddress', 'addresslocality', 'addresscountry', 'longitude', 'latitude']
clusteredData['concat'] = clusteredData[columns].astype(str).agg(' '.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clusteredData['ClusterID'] = clusteredData.groupby('telephoneNorm').ngroup()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clusteredData['concat'] = clusteredData[columns].astype(str).agg(' '.join, axis=1)


In [37]:
clusteredData.head(1000)

Unnamed: 0,row_id,origin,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude,telephone_,phone_object,E.164 format,telephoneNorm,ClusterID,concat
29,18,LocalBusiness_sushipointer.com_September2020.j...,L'wzaar Sea Food Restaurant Doha,"{'addresscountry': 'Qatar', 'addressregion': '...",https://sushipointer.com/places/qatar/doha/doh...,+974 4408 0710,Doha,"L'wzaar Sea Food Restaurant, Katara Street",Doha,QA,51.52650117874146,25.358588885959527,97444080710,"{'country_code': 974, 'extension': None, 'nati...",+97444080710,97444080710,90672,L'wzaar Sea Food Restaurant Doha Doha L'wzaar ...
43,36,LocalBusiness_sushipointer.com_September2020.j...,Shiki Japanese Fine Dining | Brasserie | Bar V...,"{'postalcode': '1010', 'addressregion': 'Vienn...",https://sushipointer.com/places/austria/vienna...,+43 1 5127397,Vienna,Krugerstraße 3,Vienna,AT,16.371088,48.2041137,4315127397,"{'country_code': 43, 'extension': None, 'natio...",+4315127397,4315127397,1289,Shiki Japanese Fine Dining | Brasserie | Bar V...
47,40,LocalBusiness_sushipointer.com_September2020.j...,Restaurant Toshi Munich,"{'addresslocality': 'Munich', 'postalcode': '8...",https://sushipointer.com/places/germany/bavari...,+49 89 25546942,Bavaria,Wurzerstraße 18,Munich,DE,11.5824846,48.1391124,498925546942,"{'country_code': 49, 'extension': None, 'natio...",+498925546942,498925546942,106412,Restaurant Toshi Munich Bavaria Wurzerstraße 1...
52,46,LocalBusiness_sushipointer.com_September2020.j...,OKKU Fine Dining Dubai,"{'addressregion': 'Dubai', 'streetaddress': 'D...",https://sushipointer.com/places/united-arab-em...,+971 4 501 8777,Dubai,Dubai,Dubai,AE,55.28715133666992,25.229711782091815,97145018777,"{'country_code': 971, 'extension': None, 'nati...",+97145018777,97145018777,90121,OKKU Fine Dining Dubai Dubai Dubai Dubai AE 55...
61,62,LocalBusiness_sushipointer.com_September2020.j...,Restaurant EMIKO Munich,"{'addresslocality': 'Munich', 'postalcode': '8...",https://sushipointer.com/places/germany/bavari...,+49 89 4111908111,Bavaria,Viktualienmarkt 6,Munich,DE,11.5758719,48.1358655,49894111908111,"{'country_code': 49, 'extension': None, 'natio...",+49894111908111,49894111908111,117822,Restaurant EMIKO Munich Bavaria Viktualienmark...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3949,239,LocalBusiness_visitlakegeneva.com_September202...,Rushwood Park,"{'addresslocality': 'Lake Geneva', 'postalcode...",https://www.visitlakegeneva.com/listing/rushwo...,262-248-3673,WI,Timothy Drive,Lake Geneva,US,-8.84289221E1,4.25801521E1,2622483673,"{'country_code': 1, 'extension': None, 'nation...",+12622483673,12622483673,14224,Rushwood Park WI Timothy Drive Lake Geneva US ...
3950,240,LocalBusiness_visitlakegeneva.com_September202...,Gordy's Lakefront Marine,"{'postalcode': '53125', 'streetaddress': '320 ...",https://www.visitlakegeneva.com/listing/gordys...,262-275-2163,WI,320 Lake St.,Fontana,US,-8.8572882E1,4.2550004E1,2622752163,"{'country_code': 1, 'extension': None, 'nation...",+12622752163,12622752163,14263,Gordy's Lakefront Marine WI 320 Lake St. Fonta...
3952,242,LocalBusiness_visitlakegeneva.com_September202...,Grand Geneva Resort The Brute,"{'addresslocality': 'Lake Geneva', 'streetaddr...",https://www.visitlakegeneva.com/listing/grand-...,(262) 248-2556,WI,7036 Grand Geneva Way,Lake Geneva,US,-8.84051177E1,4.26097253E1,2622482556,"{'country_code': 1, 'extension': None, 'nation...",+12622482556,12622482556,14222,Grand Geneva Resort The Brute WI 7036 Grand Ge...
3953,243,LocalBusiness_visitlakegeneva.com_September202...,Lake Geneva Nails and Spa,"{'postalcode': '53147', 'addresscountry': 'UNI...",https://www.visitlakegeneva.com/listing/lake-g...,262-812-4081,WI,820 N. Edward Blvd.,lake Geneva,US,-8.84147432E1,4.26001691E1,2628124081,"{'country_code': 1, 'extension': None, 'nation...",+12628124081,12628124081,14408,Lake Geneva Nails and Spa WI 820 N. Edward Blv...


In [38]:
clusteredData = clusteredData.head(1000)

### Combine tf-idf and tf vector based features

In [39]:
#clean concated description column to use tf-idf 
clusteredData['concat'] = clusteredData['concat'].apply(lambda row: row.lower())
clusteredData['tokens'] = clusteredData['concat'].apply(lambda row: word_tokenize(row))
clusteredData['tokens'] = remove_stopwords(clusteredData['tokens'],stopwords.words())
clusteredData['tokens'] = remove_punctuation (clusteredData['tokens'])
clusteredData.drop(columns=['concat'],inplace=True)
clusteredData = clusteredData[['tokens','ClusterID']]

In [40]:
clusteredData

Unnamed: 0,tokens,ClusterID
29,"[l'wzaar, food, restaurant, doha, doha, l'wzaa...",90672
43,"[shiki, japanese, fine, dining, brasserie, bar...",1289
47,"[restaurant, toshi, munich, bavaria, wurzerstr...",106412
52,"[okku, fine, dining, dubai, dubai, dubai, duba...",90121
61,"[restaurant, emiko, munich, bavaria, viktualie...",117822
...,...,...
3949,"[rushwood, park, wi, timothy, drive, lake, gen...",14224
3950,"[gordy, 's, lakefront, marine, wi, 320, lake, ...",14263
3952,"[grand, geneva, resort, brute, wi, 7036, grand...",14222
3953,"[lake, geneva, nails, spa, wi, 820, n., edward...",14408


In [41]:
#define vectorizer to match preprocessed tokes for term frequency
def dummy(doc):
    return doc

vectorizer  = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        max_features=15000)  
tf_value = vectorizer.fit_transform(clusteredData['tokens'])

In [42]:
#define vectorizer to match preprocessed tokes
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    max_features=15000)  
tfidf_value = tfidf.fit_transform(clusteredData['tokens'])

In [57]:
df_tf = pd.DataFrame(tf_value.toarray(), columns=vectorizer.get_feature_names())
df_tfidf = pd.DataFrame(tfidf_value.toarray(), columns=tfidf.get_feature_names())
df_prepared = pd.concat([clusteredData.reset_index(), df_tfidf, df_tf], axis=1)

In [60]:
y = df_prepared['ClusterID']
df_prepared.drop(columns=['tokens','ClusterID'], inplace=True)

In [62]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_prepared, y, test_size = 0.2)

In [63]:
# Baseline random forest
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
prediction = rf.predict(x_test) 
f1_mic = f1_score(y_test,prediction,average='micro')
f1_mac = f1_score(y_test,prediction,average='macro')
accuracy = accuracy_score(y_test,prediction) 
precision = precision_score(y_test,prediction,average='micro') 
recall = recall_score(y_test,prediction,average='micro') 
precision_mac = precision_score(y_test,prediction,average='macro') 
recall_mac = recall_score(y_test,prediction,average='macro') 
print("The F1-Score micro on test set: {:.4f}".format(f1_mic))
print("The F1-Score macro on test set: {:.4f}".format(f1_mac))
print("The Precision on test set: {:.4f}".format(precision))
print("The Recall on test set: {:.4f}".format(recall))
print("The Precision macro on test set: {:.4f}".format(precision_mac))
print("The Recall macro on test set: {:.4f}".format(recall_mac))
print("The Accuracy-Score on test set: {:.4f}".format(accuracy))

The F1-Score micro on test set: 0.5250
The F1-Score macro on test set: 0.1354
The Precision on test set: 0.5250
The Recall on test set: 0.5250
The Precision macro on test set: 0.1314
The Recall macro on test set: 0.1429
The Accuracy-Score on test set: 0.5250


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
