In [2]:
#!pip install rank-bm25
# from rank_bm25 import BM25Okapi
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
import os
from os import listdir
from os.path import isfile, join
import re
import numpy as np
from math import floor, ceil

import json
import gzip

from os import walk
from scipy.spatial import KDTree

# !pip install geopy
# !pip install phonenumbers
# !pip install pycountry

import geopy.distance
import phonenumbers
import pycountry

In [3]:
path = r"../../../../src/data"
lb_path_min3 = path + r"/LocalBusiness/LocalBusiness_minimum3/geo_preprocessed_v3"
lb_path_top100 = path + r"/LocalBusiness/LocalBusiness_top100/geo_preprocessed_v3"
rest_path_min3 = path + r"/Restaurant/Restaurant_minimum3/geo_preprocessed_v3"
rest_path_top100 = path + r"/Restaurant/Restaurant_top100/geo_preprocessed_v3"
hotel_path_min3 = path + r"/Hotel/Hotel_minimum3/geo_preprocessed_v3"
hotel_path_top100 = path + r"/Hotel/Hotel_top100/geo_preprocessed_v3"

In [4]:
file_path_list = [lb_path_min3, lb_path_top100, rest_path_min3, rest_path_top100, hotel_path_min3, hotel_path_top100]

for path in file_path_list:
    current = len(os.listdir(path))
    print(current)

35417
100
5037
100
7788
98


In [5]:
file_path_list = [lb_path_min3, lb_path_top100, rest_path_min3, rest_path_top100, hotel_path_min3, hotel_path_top100]


def create_df(file_path):
    files = os.listdir(file_path)
    df_as_list = []
    for lb in files:
        with gzip.open(file_path + '/' + lb, 'r') as dataFile:
            for line in dataFile:
                lineData = json.loads(line.decode('utf-8'))
                lineData["origin"] = lb
                df_as_list.append(lineData)
    df = pd.DataFrame(df_as_list)
    return df


print("creating df")
df_list = []
for file_path in file_path_list:
    df = create_df(file_path)
    df_list.append(df)

df_all = pd.concat(df_list, axis=0, ignore_index=True)

print("concatenation done")
print("Number of unique tables:", df_all['origin'].nunique())

creating df
concatenation done
Number of unique tables: 48540


In [6]:
# First Subset is to get only entries where a telephone nuumber is available
df_clean_phone = df_all[df_all["telephone"].notna()]
# Check number of tables left
print("Number of unique tables with telephone_number:", df_clean_phone["origin"].nunique())

df_clean_phone = df_clean_phone[
    ["row_id", "origin", "name", "address", "page_url", "telephone", "addressregion", "streetaddress",
     "addresslocality", "addresscountry", "longitude", "latitude"]]

Number of unique tables with telephone_number: 36067


In [7]:
df_clean_phone.head()

Unnamed: 0,row_id,origin,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude
52,1,LocalBusiness_litmind.com_September2020.json.gz,Galga Studio,"{'addresslocality': 'Madrid', 'addresscountry'...",https://es.litmind.com/galga_studio,+34,Madrid,calle Lorenzo Gonzalez 4,Madrid,ES,-3.6495914,40.4315119
53,2,LocalBusiness_litmind.com_September2020.json.gz,Salvador Model Agency,"{'postalcode': '28001', 'addresslocality': 'Ma...",https://es.litmind.com/salvadormodelagency,+34 914310707,Madrid,"General Pardiñas, 34. 1º7ª",Madrid,ES,-3.6835849285126,40.423894105042
56,5,LocalBusiness_litmind.com_September2020.json.gz,EFTI - Centro Internacional de Fotografía y Cine,,https://www.litmind.com/eftimasterfotografiamo...,+34 915529999,,,,,,
58,7,LocalBusiness_litmind.com_September2020.json.gz,Black&amp;berry Models,,https://www.litmind.com/blackandberrymodels,+34 666819078,,,,,,
59,8,LocalBusiness_litmind.com_September2020.json.gz,Yatepeino,,https://www.litmind.com/yatepeino/info,+93,,,,,,


In [8]:
# First round of parsing with raw data
# try parsing only from telephone_numbers

df_clean_phone_parse = df_clean_phone.copy()

def normalizer_2(entity):
    number = entity["telephone"]
    phone_number = phonenumbers.parse(number)
    return phone_number

df_clean_phone_parse.reset_index(inplace=True)
phone_objects = []
for row_index in df_clean_phone_parse.index:
    try:
        phone_object = normalizer_2(df_clean_phone_parse.iloc[row_index])
        phone_objects.append(phone_object)
    except:
        phone_objects.append("unknown")

In [9]:
# append to df_clean
df_clean_phone_parse["phone_object"] = pd.Series(phone_objects)

# save unkown rows where parser failed
unknown_rows = df_clean_phone_parse[df_clean_phone_parse["phone_object"] == "unknown"]
df_clean_phone_parse = df_clean_phone_parse.drop(unknown_rows.index)

# use phone package to isolate valid phone numbers
df_clean_phone_parse = df_clean_phone_parse[df_clean_phone_parse["phone_object"].apply(phonenumbers.is_valid_number)]
df_clean_phone_parse["E.164 format"] = df_clean_phone_parse["phone_object"].apply(
    lambda objects: phonenumbers.format_number(objects, phonenumbers.PhoneNumberFormat.E164))


In [10]:
# Have a look at parsed numbers
df_clean_phone_parse['telephoneNorm'] = df_clean_phone_parse['E.164 format'].str.replace('+', '').astype(np.int64)
df_clean_phone_parse["E.164 format"].value_counts().sort_values().tail(20)

  df_clean_phone_parse['telephoneNorm'] = df_clean_phone_parse['E.164 format'].str.replace('+', '').astype(np.int64)


+918004561000     2383
+85267718566      2393
+622139730888     2806
+74951334958      2853
+420222518768     2943
+421915427748     2967
+420606556615     3035
+40364431800      3268
+74956468503      3538
+919319111766     3709
+914440114081     3785
+421905383904     3863
+420725535406     3892
+18883836917      4371
+18885385188      4916
+14072333331      4918
+919313931393     6937
+18884082399      7363
+918076909847    13811
+78007750219     23168
Name: E.164 format, dtype: int64

In [11]:
df_clean_phone_parse.head()

Unnamed: 0,index,row_id,origin,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude,phone_object,E.164 format,telephoneNorm
1,53,2,LocalBusiness_litmind.com_September2020.json.gz,Salvador Model Agency,"{'postalcode': '28001', 'addresslocality': 'Ma...",https://es.litmind.com/salvadormodelagency,+34 914310707,Madrid,"General Pardiñas, 34. 1º7ª",Madrid,ES,-3.6835849285126,40.423894105042,Country Code: 34 National Number: 914310707,34914310707,34914310707
2,56,5,LocalBusiness_litmind.com_September2020.json.gz,EFTI - Centro Internacional de Fotografía y Cine,,https://www.litmind.com/eftimasterfotografiamo...,+34 915529999,,,,,,,Country Code: 34 National Number: 915529999,34915529999,34915529999
3,58,7,LocalBusiness_litmind.com_September2020.json.gz,Black&amp;berry Models,,https://www.litmind.com/blackandberrymodels,+34 666819078,,,,,,,Country Code: 34 National Number: 666819078,34666819078,34666819078
5,60,9,LocalBusiness_litmind.com_September2020.json.gz,Nomade MNGT,,https://es.litmind.com/nomade.mngt,+54 1143611490,,,,,,,Country Code: 54 National Number: 1143611490,541143611490,541143611490
6,61,10,LocalBusiness_litmind.com_September2020.json.gz,UNIC AZAFATAS S.L,"{'postalcode': '28250', 'addresscountry': 'ES'...",https://es.litmind.com/456459,+34 918599376,Madrid,C/ TORRENCINA 18,TORRELODONES,ES,-3.8939949,40.5808292,Country Code: 34 National Number: 918599376,34918599376,34918599376


In [12]:
df_clean_phone.head()

Unnamed: 0,row_id,origin,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude
52,1,LocalBusiness_litmind.com_September2020.json.gz,Galga Studio,"{'addresslocality': 'Madrid', 'addresscountry'...",https://es.litmind.com/galga_studio,+34,Madrid,calle Lorenzo Gonzalez 4,Madrid,ES,-3.6495914,40.4315119
53,2,LocalBusiness_litmind.com_September2020.json.gz,Salvador Model Agency,"{'postalcode': '28001', 'addresslocality': 'Ma...",https://es.litmind.com/salvadormodelagency,+34 914310707,Madrid,"General Pardiñas, 34. 1º7ª",Madrid,ES,-3.6835849285126,40.423894105042
56,5,LocalBusiness_litmind.com_September2020.json.gz,EFTI - Centro Internacional de Fotografía y Cine,,https://www.litmind.com/eftimasterfotografiamo...,+34 915529999,,,,,,
58,7,LocalBusiness_litmind.com_September2020.json.gz,Black&amp;berry Models,,https://www.litmind.com/blackandberrymodels,+34 666819078,,,,,,
59,8,LocalBusiness_litmind.com_September2020.json.gz,Yatepeino,,https://www.litmind.com/yatepeino/info,+93,,,,,,


In [13]:
# Now derive all rows where parsing failed
succesful_rows = list(df_clean_phone_parse["index"])
df_residual = df_clean_phone.drop(succesful_rows)

In [14]:
print(len(df_residual))
print(len(df_clean_phone_parse))

3898027
1007810


In [15]:
print(len(df_clean_phone))

4905837


In [16]:
# Define function for cleaning telephone numbers
def remove_non_digits(string):
    cleaned = re.sub('[^0-9]', '', string)
    return cleaned
df_residual['telephone_'] = df_residual['telephone'].astype('str').apply(remove_non_digits)

In [17]:
df_residual.head()

Unnamed: 0,row_id,origin,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude,telephone_
52,1,LocalBusiness_litmind.com_September2020.json.gz,Galga Studio,"{'addresslocality': 'Madrid', 'addresscountry'...",https://es.litmind.com/galga_studio,+34,Madrid,calle Lorenzo Gonzalez 4,Madrid,ES,-3.6495914,40.4315119,34
59,8,LocalBusiness_litmind.com_September2020.json.gz,Yatepeino,,https://www.litmind.com/yatepeino/info,+93,,,,,,,93
66,15,LocalBusiness_litmind.com_September2020.json.gz,Alessia Model Management,,https://cat.litmind.com/alessiamodels/models/men,+1809 8295859639,,,,,,,18098295859639
70,19,LocalBusiness_litmind.com_September2020.json.gz,Cidermill Films,,https://cat.litmind.com/cidermillfilms,+93,,,,,,,93
76,25,LocalBusiness_litmind.com_September2020.json.gz,Neukla web &amp; film,,https://es.litmind.com/capt.tousu,+34 00491708332529,,,,,,,3400491708332529


In [18]:
# Preprocessing for 2nd parsing
# now get the rows where we can use additional coutry code for parsing
#  
df_clean = df_residual[df_residual["addresscountry"].notna()]


In [19]:
# Normalize country codes for telephone package
countries = {}
for country in pycountry.countries:
    countries[country.name] = country.alpha_2


# fuction to modify the country dictionary in uppercase
def modify_dic(d):
    for key in list(d.keys()):
        new_key = key.upper()
        d[new_key] = d[key]
    return d


countries_upper = modify_dic(countries)

In [20]:
# uppercase the df_column
df_clean["addresscountry"] = df_clean["addresscountry"].str.upper()

# Replace known countries with ISO-2 format country code
for key, value in countries_upper.items():
    df_clean["addresscountry"] = df_clean["addresscountry"].str.replace(key, value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["addresscountry"] = df_clean["addresscountry"].str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["addresscountry"] = df_clean["addresscountry"].str.replace(key, value)
  df_clean["addresscountry"] = df_clean["addresscountry"].str.replace(key, value)


In [21]:
# Manual country code replacement
country_dictionary = {
    "UNITED STATES": "US",
    "USA": "US",
    "UNITED KINGDOM": "GB",
    "UK": "GB",
    "CANADA": "CA",
    "AUSTRALIA": "AU",
    "UNITED ARAB EMIRATES": "AE",
    "UAE": "AE",
    "INDIA": "IN",
    "NEW ZEALAND": "NZ",
    "SVERIGE": "SE",
    "DEUTSCHLAND": "DE",
    "DEU": "DE",
    "RUSSIA": "RU",
    "ITALIA": "IT",
    "IRAN": "IR",
    ", IN": "IN",
    "ENGLAND": "GB",
    "FRA": "FR"
}
for key, value in country_dictionary.items():
    df_clean["addresscountry"] = df_clean["addresscountry"].str.replace(key, value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["addresscountry"] = df_clean["addresscountry"].str.replace(key, value)


In [22]:
print("phone_number normalizer")
def normalizer(entity):
    number = entity["telephone_"]
    address_country = entity["addresscountry"]
    phone_number = phonenumbers.parse(number, address_country)
    return phone_number

phone_number normalizer


In [23]:
df_clean.reset_index(inplace=True)
phone_objects = []
for row_index in df_clean.index:
    try:
        phone_object = normalizer(df_clean.iloc[row_index])
        phone_objects.append(phone_object)
    except:
        phone_objects.append("unknown")


In [24]:
print("normalizer successful")

normalizer successful


In [25]:
# append to df_clean
df_clean["phone_object"] = pd.Series(phone_objects)

# save unkown rows where parser failed
unknown_rows = df_clean[df_clean["phone_object"] == "unknown"]
df_clean = df_clean.drop(unknown_rows.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["phone_object"] = pd.Series(phone_objects)


In [26]:
# use phone package to isolate valid phone numbers
df_clean = df_clean[df_clean["phone_object"].apply(phonenumbers.is_valid_number)]
df_clean["E.164 format"] = df_clean["phone_object"].apply(
    lambda objects: phonenumbers.format_number(objects, phonenumbers.PhoneNumberFormat.E164))


In [27]:
# Have a look at parsed numbers
df_clean['telephoneNorm'] = df_clean['E.164 format'].str.replace('+', '').astype(np.int64)
df_clean["E.164 format"].value_counts().sort_values().tail(20)


  df_clean['telephoneNorm'] = df_clean['E.164 format'].str.replace('+', '').astype(np.int64)


+18775784138       1347
+18886385044       1365
+34902901532       1444
+48507717950       1528
+525541641401      1671
+443458121007      1701
+917676202033      1799
+17136954939       2053
+491796910164      2059
+17156937123       2134
+18664773715       2179
+611300032861      2209
+611300695278      2970
+498912085467      3189
+917861004444      4051
+351936408200      5136
+9118601234569     5487
+18555161090       6955
+18884082399       7587
+33972196746      29080
Name: E.164 format, dtype: int64

In [28]:
# Now derive all rows where parsing failed
succesful_rows_2 = list(df_clean["index"])
df_residual_final = df_residual.drop(succesful_rows_2)

In [29]:
# LOGIC: df_residual_final + df_clean + df_clean_phone_parse must be equal to whole df_clena_phone

In [30]:
print("Creating Concatenated Dataframes")

Creating Concatenated Dataframes


In [31]:
### Concatenate dataframes
# First, the subsets which could be parsed with telephone package

df_parsed = pd.concat([df_clean_phone_parse, df_clean], axis=0)

In [34]:
# append a new column for later grouping

df_residual_final["telephoneNorm"] = df_residual_final["telephone_"]

In [40]:
# TESTING

first = len(df_residual_final)
second = len(df_clean_phone_parse)
third = len(df_clean)

total = first + second + third
print(total)
print(len(df_clean_phone))

4905837
4905837


In [46]:
### Final Concatenation

df_total = pd.concat([df_parsed, df_residual_final], axis=0)
df_total["telephoneNorm"].value_counts().sort_values().tail(20)

0948235300        4299
18883836917       4371
0                 4416
00                4543
33972722345       4865
18885385188       4916
14072333331       4918
0977773614        5142
8882057322        5237
351936408200      5259
9118601234569     5487
919313931393      6937
18555161090       6955
331730790630      9870
918076909847     13811
13126466365      14010
18884082399      14950
78007750219      23168
33972196746      29080
                 87937
Name: telephoneNorm, dtype: int64

In [67]:
### Some cleaning

df_total_clean = df_total[df_total["telephoneNorm"] != ""]
df_total_clean = df_total_clean[df_total_clean["telephoneNorm"] != "0"]
df_total_clean = df_total_clean[df_total_clean["telephoneNorm"] != "00"]

In [68]:
df_total_clean["telephoneNorm"].value_counts().sort_values().tail(20)

917861004444      4051
02037884841       4107
02145682000       4218
0948235300        4299
18883836917       4371
33972722345       4865
18885385188       4916
14072333331       4918
0977773614        5142
8882057322        5237
351936408200      5259
9118601234569     5487
919313931393      6937
18555161090       6955
331730790630      9870
918076909847     13811
13126466365      14010
18884082399      14950
78007750219      23168
33972196746      29080
Name: telephoneNorm, dtype: int64

In [69]:
df_total_clean.head()

Unnamed: 0,index,row_id,origin,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude,phone_object,E.164 format,telephoneNorm,telephone_
1,53.0,2,LocalBusiness_litmind.com_September2020.json.gz,Salvador Model Agency,"{'postalcode': '28001', 'addresslocality': 'Ma...",https://es.litmind.com/salvadormodelagency,+34 914310707,Madrid,"General Pardiñas, 34. 1º7ª",Madrid,ES,-3.6835849285126,40.423894105042,Country Code: 34 National Number: 914310707,34914310707,34914310707,
2,56.0,5,LocalBusiness_litmind.com_September2020.json.gz,EFTI - Centro Internacional de Fotografía y Cine,,https://www.litmind.com/eftimasterfotografiamo...,+34 915529999,,,,,,,Country Code: 34 National Number: 915529999,34915529999,34915529999,
3,58.0,7,LocalBusiness_litmind.com_September2020.json.gz,Black&amp;berry Models,,https://www.litmind.com/blackandberrymodels,+34 666819078,,,,,,,Country Code: 34 National Number: 666819078,34666819078,34666819078,
5,60.0,9,LocalBusiness_litmind.com_September2020.json.gz,Nomade MNGT,,https://es.litmind.com/nomade.mngt,+54 1143611490,,,,,,,Country Code: 54 National Number: 1143611490,541143611490,541143611490,
6,61.0,10,LocalBusiness_litmind.com_September2020.json.gz,UNIC AZAFATAS S.L,"{'postalcode': '28250', 'addresscountry': 'ES'...",https://es.litmind.com/456459,+34 918599376,Madrid,C/ TORRENCINA 18,TORRELODONES,ES,-3.8939949,40.5808292,Country Code: 34 National Number: 918599376,34918599376,34918599376,


In [71]:
df_total_clean["origin"].nunique()

35728

In [72]:
df_total_clean.to_json("19-12-21_MatchingFile", compression="gzip", orient='records', lines=True)
print("success")

success
