In [1]:
import pandas as pd
import numpy as np
import json
import gzip
import os
import concurrent.futures
import glob
import io
import seaborn as sns
import multiprocessing
from tqdm import tqdm
import mmap
from multiprocessing import Pool
from os import listdir

## Prepare a small train set for code testing

In [4]:
# Root folder of the small "test_code" sample; the following cells read the
# gzipped tables from its 'train' subfolder.
path = '../../../src/data/schemafiltereddata/TrainTestTables/test_code'

In [5]:
# Names of all gzipped JSON tables found directly in the train folder.
train_dir = path + '/train/'
files = [entry for entry in os.listdir(train_dir) if entry.endswith('.json.gz')]

In [6]:
# Build one record per gzipped JSON-lines table in the train folder.
train = []
for file in files:
    # Parsing the file also verifies that it is readable gzip/JSON-lines.
    df = pd.read_json(os.path.join(path+'/train/'+file), compression = 'gzip', lines=True)
    # NOTE(review): 'table_data' holds the file name, not the parsed table `df`;
    # the later export cell stores `df.values.tolist()` instead - confirm which
    # one is intended here.
    train.append({'key': file, 'table_data': file})

['Hotel_firsthotels.com_September2020.json.gz',
 'Event_yescorvallis.org_September2020.json.gz',
 'Person_bongacams4.com_September2020.json.gz',
 'Event_317area.com_September2020.json.gz',
 'Book_double-eye.com_September2020.json.gz',
 'Book_derbygpvts.co.uk_September2020.json.gz',
 'Hotel_find-around.com_September2020.json.gz',
 'Book_abebooks.co.uk_September2020.json.gz',
 'MusicRecording_9mileroots.com_September2020.json.gz',
 'CreativeWork_yambamarina.com.au_September2020.json.gz',
 'CreativeWork_10thcivicforum.com_September2020.json.gz',
 'CreativeWork_1000museums.com_September2020.json.gz',
 'LocalBusiness_fastappliancerepairsydney.com.au_September2020.json.gz',
 'CreativeWork_yucksauce.com_September2020.json.gz',
 'Person_bosmagibson.com_September2020.json.gz',
 'MusicRecording_2ugly2hold.com_September2020.json.gz',
 'LocalBusiness_foxfield-inn.com_September2020.json.gz']

In [1]:
# Mapping from label string to integer label index (0..201).
# The keys are looked up further down as `<column name> + <file-name prefix>`
# (see `labels.loc[cols+index]`), e.g. 'name' + 'Hotel' -> 'nameHotel'.
# Most keys are in alphabetical order, but a few are not (e.g. index 0, 70, 97,
# 144, 156, 200, 201).
# NOTE(review): presumably these indices were reassigned on purpose - do not
# re-sort or renumber without checking the consumers of labels.csv.
labels_new = {'worstratingProduct': 0,
 'acceptsreservationsRestaurant': 1,
 'additionalnamePerson': 2,
 'additionalpropertyProduct': 3,
 'addressHotel': 4,
 'addressLocalBusiness': 5,
 'addressPerson': 6,
 'addressProduct': 7,
 'addressRestaurant': 8,
 'addresslocalityLocalBusiness': 9,
 'addressregionLocalBusiness': 10,
 'affiliationPerson': 11,
 'aggregateratingBook': 12,
 'aggregateratingCreativeWork': 13,
 'aggregateratingHotel': 14,
 'aggregateratingLocalBusiness': 15,
 'aggregateratingProduct': 16,
 'aggregateratingRecipe': 17,
 'aggregateratingRestaurant': 18,
 'alternatenameProduct': 19,
 'alternativeheadlineCreativeWork': 20,
 'areaservedLocalBusiness': 21,
 'articlebodyCreativeWork': 22,
 'audienceProduct': 23,
 'authorBook': 24,
 'authorCreativeWork': 25,
 'authorProduct': 26,
 'authorRecipe': 27,
 'availabilityProduct': 28,
 'bestratingProduct': 29,
 'birthdatePerson': 30,
 'birthplacePerson': 31,
 'bookeditionBook': 32,
 'bookformatBook': 33,
 'brandProduct': 34,
 'breadcrumbProduct': 35,
 'byartistMusicAlbum': 36,
 'byartistMusicRecording': 37,
 'categoryProduct': 38,
 'citystatezipLocalBusiness': 39,
 'colorProduct': 40,
 'commentcountCreativeWork': 41,
 'conditionProduct': 42,
 'contactpointLocalBusiness': 43,
 'contactpointPerson': 44,
 'cookingmethodRecipe': 45,
 'cooktimeRecipe': 46,
 'copyrightholderCreativeWork': 47,
 'copyrightyearCreativeWork': 48,
 'creatorCreativeWork': 49,
 'datecreatedCreativeWork': 50,
 'datecreatedMusicRecording': 51,
 'datemodifiedCreativeWork': 52,
 'datemodifiedRecipe': 53,
 'datepublishedBook': 54,
 'datepublishedCreativeWork': 55,
 'datepublishedMusicRecording': 56,
 'datepublishedProduct': 57,
 'datepublishedRecipe': 58,
 'deathdatePerson': 59,
 'depthProduct': 60,
 'disambiguatingdescriptionProduct': 61,
 'doortimeEvent': 62,
 'durationEvent': 63,
 'durationMusicRecording': 64,
 'emailHotel': 65,
 'emailLocalBusiness': 66,
 'emailPerson': 67,
 'enddateEvent': 68,
 'episodenumberTVEpisode': 69,
 'worksforPerson': 70,
 'familynamePerson': 71,
 'faxnumberLocalBusiness': 72,
 'faxnumberPerson': 73,
 'genderPerson': 74,
 'genreBook': 75,
 'genreCreativeWork': 76,
 'genreMusicRecording': 77,
 'geoHotel': 78,
 'geoLocalBusiness': 79,
 'geoPlace': 80,
 'geoProduct': 81,
 'geoRestaurant': 82,
 'givennamePerson': 83,
 'gtin12Product': 84,
 'gtin13Product': 85,
 'gtin14Product': 86,
 'gtin8Product': 87,
 'gtinProduct': 88,
 'hasmapLocalBusiness': 89,
 'hasmenuRestaurant': 90,
 'headlineCreativeWork': 91,
 'headlineRecipe': 92,
 'heightPerson': 93,
 'heightProduct': 94,
 'homelocationPerson': 95,
 'identifierProduct': 96,
 'worklocationPerson': 97,
 'ingredientsRecipe': 98,
 'inlanguageBook': 99,
 'inlanguageCreativeWork': 100,
 'interactioncountCreativeWork': 101,
 'interactionstatisticCreativeWork': 102,
 'interactiontypeCreativeWork': 103,
 'isbnBook': 104,
 'ispartofRecipe': 105,
 'isrelatedtoProduct': 106,
 'issimilartoProduct': 107,
 'itemconditionProduct': 108,
 'itemlistelementProduct': 109,
 'jobtitlePerson': 110,
 'keywordsCreativeWork': 111,
 'keywordsRecipe': 112,
 'knowslanguagePerson': 113,
 'legalnameLocalBusiness': 114,
 'locationEvent': 115,
 'locationLocalBusiness': 116,
 'mainentityofpageCreativeWork': 117,
 'mainentityofpagePerson': 118,
 'mainentityofpageProduct': 119,
 'mainentityofpageRecipe': 120,
 'makesofferPerson': 121,
 'manufacturerProduct': 122,
 'materialProduct': 123,
 'memberofPerson': 124,
 'menuRestaurant': 125,
 'modelProduct': 126,
 'mpnProduct': 127,
 'nameBook': 128,
 'nameCreativeWork': 129,
 'nameEvent': 130,
 'nameHotel': 131,
 'nameLocalBusiness': 132,
 'nameMusicAlbum': 133,
 'nameMusicRecording': 134,
 'namePlace': 135,
 'nameProduct': 136,
 'nameRecipe': 137,
 'nameRestaurant': 138,
 'nameTVEpisode': 139,
 'nationalityPerson': 140,
 'numberofpagesBook': 141,
 'numtracksMusicAlbum': 142,
 'nutritionRecipe': 143,
 'weightProduct': 144,
 'offersBook': 145,
 'offersCreativeWork': 146,
 'offersProduct': 147,
 'openinghoursLocalBusiness': 148,
 'openinghoursRestaurant': 149,
 'openinghoursspecificationLocalBusiness': 150,
 'openinghoursspecificationPlace': 151,
 'openinghoursspecificationRestaurant': 152,
 'organizerEvent': 153,
 'partofseriesTVEpisode': 154,
 'paymentacceptedLocalBusiness': 155,
 'widthProduct': 156,
 'performersEvent': 157,
 'performtimeRecipe': 158,
 'postalcodeLocalBusiness': 159,
 'preptimeRecipe': 160,
 'priceProduct': 161,
 'pricecurrencyProduct': 162,
 'pricerangeHotel': 163,
 'pricerangeLocalBusiness': 164,
 'pricerangeRestaurant': 165,
 'publisherBook': 166,
 'publisherCreativeWork': 167,
 'publisherRecipe': 168,
 'ratingvalueProduct': 169,
 'recipecategoryRecipe': 170,
 'recipecuisineRecipe': 171,
 'recipeingredientRecipe': 172,
 'recipeinstructionsRecipe': 173,
 'recipeyieldRecipe': 174,
 'releasedateProduct': 175,
 'reviewLocalBusiness': 176,
 'reviewProduct': 177,
 'reviewRecipe': 178,
 'reviewcountProduct': 179,
 'reviewsProduct': 180,
 'sameasLocalBusiness': 181,
 'sameasPerson': 182,
 'sameasPlace': 183,
 'servescuisineRestaurant': 184,
 'shop-currencyProduct': 185,
 'starratingHotel': 186,
 'streetaddressLocalBusiness': 187,
 'suitablefordietRecipe': 188,
 'telephoneHotel': 189,
 'telephoneLocalBusiness': 190,
 'telephonePerson': 191,
 'telephonePlace': 192,
 'telephoneRestaurant': 193,
 'titleProduct': 194,
 'totaltimeRecipe': 195,
 'trackMusicAlbum': 196,
 'typicalagerangeEvent': 197,
 'versionCreativeWork': 198,
 'weightPerson': 199,
 'offerdetailsProduct': 200,
 'founderLocalBusiness': 201}

In [7]:
# Flatten the label mapping into [label string, label index] pairs, keeping
# the dictionary's insertion order.
# The previous loop unpacked the items as `idx, type`, i.e. with the two names
# swapped relative to their contents, and rebinding `type` hid the built-in
# `type()` for every later cell. A comprehension leaves no loop variables behind.
labels = [[label_name, label_index] for label_name, label_index in labels_new.items()]

In [12]:
# Wrap the [label, index] pairs in a DataFrame: one row per pair, and two
# columns that get the default integer names 0 and 1.
labels = pd.DataFrame(data=labels)

In [17]:
# Name the two columns by position: the first holds the label string ('type'),
# the second its integer index ('idx').
# The frame is built from plain [label, index] pairs, so its columns are the
# integers 0 and 1; renaming 'idx'/'type' matched nothing and the CSV was then
# written without the 'type' and 'idx' headers that the reading cells rely on.
# Setting the names positionally is also safe to re-run.
labels = labels.set_axis(['type', 'idx'], axis=1)

In [18]:
# Persist the label table as CSV (row index omitted); later cells read this
# file back with pd.read_csv and use its 'type' column.
labels.to_csv('../../../src/data/schemafiltereddata/tabbie/sato/labels.csv', index=False)

In [41]:
# Reload the saved label table and keep only the distinct label strings,
# as a set, for the membership/difference check in the next cell.
label_table = pd.read_csv('../../../src/data/schemafiltereddata/tabbie/sato/labels.csv')
labels = set(label_table['type'])
labels

{'acceptsreservationsRestaurant',
 'additionalnamePerson',
 'additionalpropertyProduct',
 'addressHotel',
 'addressLocalBusiness',
 'addressPerson',
 'addressProduct',
 'addressRestaurant',
 'addresslocalityLocalBusiness',
 'addressregionLocalBusiness',
 'affiliationPerson',
 'aggregateratingBook',
 'aggregateratingCreativeWork',
 'aggregateratingHotel',
 'aggregateratingLocalBusiness',
 'aggregateratingProduct',
 'aggregateratingRecipe',
 'aggregateratingRestaurant',
 'alternatenameProduct',
 'alternativeheadlineCreativeWork',
 'areaservedLocalBusiness',
 'articlebodyCreativeWork',
 'audienceProduct',
 'authorBook',
 'authorCreativeWork',
 'authorProduct',
 'authorRecipe',
 'availabilityProduct',
 'bestratingProduct',
 'birthdatePerson',
 'birthplacePerson',
 'bookeditionBook',
 'bookformatBook',
 'brandProduct',
 'breadcrumbProduct',
 'byartistMusicAlbum',
 'byartistMusicRecording',
 'categoryProduct',
 'citystatezipLocalBusiness',
 'colorProduct',
 'commentcountCreativeWork',
 'cond

In [74]:
# Check that every selected column has an entry in the label set.
#
# NOTE(review): `listcolt` is not defined by any live code visible in this
# notebook, so this cell fails on a fresh kernel. The commented-out lines below
# look like the code that originally built it - confirm and restore them (or
# load selected_cols.csv) before relying on Restart & Run All.
# path = '/work-ceph/bizer-tp2021/data_integration_using_deep_learning/src/data/data/CSV_files'
# stats = pd.read_csv(path + '/all_files_stats.csv')
# files = pd.read_csv(path + '/all_files_cleaned.csv')
# files = files.drop(columns={'Unnamed: 0'})
# #display(files)
# listcol= files.groupby('filename')['column_name'].apply(list).reset_index(name='listofcolumns')
# listcol.filename = listcol.filename.str.split('_', expand=True)[0]
# listcol.index = listcol['filename']
# listcol
# listcolt = listcol
# listcolt = listcolt[(listcolt['filename']=='TVEpisode') | (listcolt['filename']=='MusicAlbum')]

# Build "<column name><index label>" for every row of `listcolt`.
# The previous version indexed the table with `cols`, a variable left over from
# another cell, so only that single row was ever checked.
c = {
    column_name + table_type
    for table_type, column_names in listcolt['listofcolumns'].items()
    for column_name in column_names
}
# Labels the selected columns need but the label set lacks; sorted so the
# output is deterministic. An empty list means everything is covered.
diff = sorted(c - labels)
diff

[]

In [75]:
# Save the column-selection table; the cell under "Start HERE" reloads this
# file and calls set_index('filename') on it.
# NOTE(review): index=False drops the index, so 'filename' presumably also
# exists as a regular column - confirm. The absolute path ties this cell to one
# machine.
listcolt.to_csv('/work-ceph/bizer-tp2021/data_integration_using_deep_learning/src/data/data/CSV_files/selected_cols.csv', index=False)

# Start HERE

In [2]:
import ast

# Load the per-table-type column selection, keyed by the 'filename' column.
# NOTE: the variable name `list` shadows the built-in, so `list(...)` cannot be
# called after this cell. The name is kept only because a later cell looks the
# table up as `list.loc[...]`.
list = pd.read_csv('/work-ceph/bizer-tp2021/data_integration_using_deep_learning/src/data/data/CSV_files/selected_cols.csv')
list = list.set_index('filename')
# The CSV stores each column list as text; parse every cell back into a real
# Python list. Assigning the whole column replaces the former row-by-row
# `list.loc[row]['listofcolumns'] = ...`, a chained assignment that can end up
# writing to a temporary copy and leave the frame unchanged.
list['listofcolumns'] = list['listofcolumns'].apply(ast.literal_eval)
list

Unnamed: 0_level_0,listofcolumns
filename,Unnamed: 1_level_1
Book,"[name, author, isbn, publisher, offers, datepu..."
CreativeWork,"[headline, author, datepublished, datemodified..."
Event,"[name, enddate, location, organizer, duration,..."
Hotel,"[name, address, telephone, pricerange, geo, ag..."
LocalBusiness,"[name, address, telephone, geo, pricerange, ag..."
MusicAlbum,"[name, byartist, track, numtracks]"
MusicRecording,"[name, duration, byartist, datepublished, genr..."
Person,"[mainentityofpage, jobtitle, birthdate, givenn..."
Place,"[name, geo, telephone, sameas, openinghoursspe..."
Product,"[name, offers, brand, aggregaterating, mpn, ca..."


In [3]:
# Reload the label table and key it by label string, so that
# labels.loc[<label>]['idx'] yields the label's integer index.
labels = pd.read_csv(
    '../../../src/data/schemafiltereddata/tabbie/sato/labels.csv'
).set_index('type')
labels

Unnamed: 0_level_0,idx
type,Unnamed: 1_level_1
worstratingProduct,0
acceptsreservationsRestaurant,1
additionalnamePerson,2
additionalpropertyProduct,3
addressHotel,4
...,...
typicalagerangeEvent,197
versionCreativeWork,198
weightPerson,199
offerdetailsProduct,200


In [4]:
# Build one export record per gzipped JSON-lines table in the train folder.
path = '../../../src/data/schemafiltereddata/TrainTestTables/test_code'
files = [entry for entry in os.listdir(path+'/train/') if entry.endswith('.json.gz')]
data = []
c = 0  # number of files processed so far
for c, file in enumerate(files, start=1):
    df = pd.read_json(path + '/train/' + file, compression='gzip', lines=True)
    # Cell values only, as nested Python lists (no header row).
    rows = df.values.tolist()
    # The part of the file name before the first '_' selects the row of the
    # column-selection table (held in the variable `list`).
    index = file.split('_')[0]
    # Columns that are both present in this table and selected for its type,
    # in alphabetical order.
    shared_columns = sorted(set(df.columns) & set(list.loc[index]['listofcolumns']))
    # Position of each shared column inside the table ...
    col_positions = [df.columns.get_loc(column) for column in shared_columns]
    # ... and the index of its "<column><type>" label.
    label_ids = [labels.loc[column + index]['idx'] for column in shared_columns]
    data.append({
        "key": file,
        "table_data": rows,
        "id": file,
        "field_list": col_positions,
        "field_names": label_ids,
        "col_idx": col_positions,
        "label_idx": label_ids,
    })
data

[{'key': 'Hotel_firsthotels.com_September2020.json.gz',
  'table_data': [[0,
    'First Hotel Kramm',
    {'streetaddress': 'Torggatan 14',
     'email': 'kramm.reception@firsthotels.se',
     'addresscountry': 'Sweden',
     'postalcode': '872 30',
     'addresslocality': 'Kramfors',
     'telephone': '+46 612 77 13 30'},
    'First Hotel Kramm is located close to the centre of Kramfors and offers the perfect starting point for day trips to the famous Höga Kusten archipelago, as well as other scenic destinations. The hotel is easily accessible as there is a travel hub close by with bus and train connections. The hotel is also close to Höga Kusten Airport. We offer 98 comfortable guest rooms and state-of-the-art meeting facilities with capacity for up to 200 guests. In our restaurant and bar, our focus is on quality and we use local products and innovative thinking with a traditional spirit. There is a car park right outside the hotel entrance.',
    '0 -',
    'https://firsthotelsiv.a

In [6]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy scalars and arrays.

    Pass it as ``cls=NpEncoder`` to ``json.dumps`` / ``json.dump``.
    """

    def default(self, obj):
        """Convert a NumPy object into a JSON-serialisable Python object.

        Called by the encoder only for objects it cannot serialise itself.

        Returns:
            ``bool`` for ``np.bool_``, ``int`` for NumPy integers, ``float``
            for NumPy floats, and a (nested) list for ``np.ndarray``.

        Raises:
            TypeError: for any other unsupported type (raised by the base class).
        """
        # np.bool_ is neither a Python bool nor a NumPy integer, so without
        # this branch boolean cells fell through to the TypeError below.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [7]:
import json

# Write the records in JSON Lines format: one JSON document per line.
# NpEncoder is used so NumPy values inside the records can be serialised.
with open("train_schema.jsonl", 'w') as out_file:
    for record in data:
        line = json.dumps(record, cls=NpEncoder)
        out_file.write(line + "\n")

# Reduced variant of the export: only key, table_data and col_idx per table.
# The previous version of this cell could not run:
#   * it read the column selection from `listcolt`, which is not defined after
#     "Start HERE" (the table is loaded into the variable `list` there);
#   * `list(cols)` tried to call that DataFrame;
#   * `train_small.append({...}, ignore_index=True)` was applied to a Python list;
#   * `json.dumps` received NumPy arrays/integers without an encoder for them.
path = '../../../src/data/schemafiltereddata/TrainTestTables/test_code'
files = [file for file in os.listdir(path+'/train/') if file.endswith('.json.gz')]
data = []
for file in files:
    df = pd.read_json(os.path.join(path, 'train', file), compression='gzip', lines=True)
    # The file-name prefix before the first '_' selects the row of the
    # column-selection table. NOTE: `list` is that DataFrame, not the built-in.
    index = file.split('_')[0]
    selected_columns = set(list.loc[index]['listofcolumns'])
    # Columns present in this table and selected for its type, in a
    # deterministic (alphabetical) order.
    matched_columns = sorted(set(df.columns) & selected_columns)
    # Positions of the matched columns inside the table.
    col_idx = [df.columns.get_loc(column) for column in matched_columns]
    data.append({"key": file, "table_data": df.values.tolist(), "col_idx": col_idx})

# NpEncoder converts any remaining NumPy scalars into plain Python numbers.
jsonData = json.dumps(data, cls=NpEncoder)

In [9]:
# Load a single table (the second file in the listing) for manual inspection.
path = '../../../src/data/schemafiltereddata/TrainTestTables/test_code'
files = [entry for entry in os.listdir(path+'/train/') if entry.endswith('.json.gz')]
file = files[1]
df = pd.read_json(path + '/train/' + file, compression='gzip', lines=True)
# Range over the column positions of the table.
range(len(df.columns))

range(0, 9)

In [12]:
# All cell values of the table as a NumPy array (header not included).
s1 = df.values

In [13]:
# Display the array built in the previous cell.
s1

array([[0, '2017-11-01T08:30:00-07:00',
        'Holiday Digital Marketing Seminar Livestream',
        '2017-11-01T11:00:00-07:00',
        '&lt;p&gt;As part of the U.S. Small Business Administration Tech Coalition - Google, Constant Contact, Facebook and Square are teaming up to create a marketing wonderland, in time to help businesses for the holidays. HPR Digital Marketing is pleased to be hosting a livestream workshop to show small businesses how to be found when this season’s [&hellip;]&lt;/p&gt;\\',
        None, 'Organization',
        {'name': 'Corvallis-Benton Economic Development Office'},
        'https://yescorvallis.org/event/holiday-digital-marketing-seminar-livestream/'],
       [1, '2017-06-27T14:00:00-07:00', 'Guided Tour of QuickBooks',
        '2017-06-28T16:00:00-07:00',
        '&lt;p&gt;Two-day class to introduce you to QuickBooks! 2-4pm each day Call 541-917-4929 to register $89&lt;/p&gt;\\',
        {'telephone': '0', 'name': 'LBCC SBDC'}, 'Organization',
     