In [None]:
import os
import sys
from os.path import dirname, abspath
import argparse
import pandas as pd
from EmbDI import data_preprocessing as dp

In [None]:
# Root folder of the EmbDI checkout; dataset and info-file paths in the
# following cells are built from it.
# The EMBDI_PATH environment variable overrides the default, so the notebook
# can run on another machine without editing this cell.
PATH = os.environ.get(
    "EMBDI_PATH",
    "C:/Users/Simo/Desktop/progettongdb/algoritmi/embdi/embdi-master",
)

ETL

Load data using pandas

In [None]:
# Load the two raw COVID datasets from <PATH>/pipeline/datasets/covid/.
# low_memory=False asks pandas to parse each file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
df1 = pd.read_csv(
    f"{PATH}/pipeline/datasets/covid/bing_covid-19_data.csv",
    sep=",",
    low_memory=False,
)
df2 = pd.read_csv(
    f"{PATH}/pipeline/datasets/covid/covid_tracking.csv",
    sep=",",
    low_memory=False,
)

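Keeping only the values shared by the two datasets: restrict the Bing data to USA rows, drop the first three characters of `iso_subdivision`, and sample two random rows per state from each dataset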

In [None]:
# NOTE: this cell overwrites df1/df2. Re-run the loading cell before running
# it again, otherwise iso_subdivision loses three more characters each time.

# Rows drawn per state, and the seed that makes the draw repeatable.
ROWS_PER_STATE = 2
SAMPLE_SEED = 42

# Keep only the USA rows of the Bing dataset, then drop the first three
# characters of iso_subdivision (presumably a "US-" country prefix, so the
# values line up with df2["state"] -- TODO confirm against the data).
df1 = df1.drop(df1[df1.iso3 != 'USA' ].index)
df1['iso_subdivision'] = df1['iso_subdivision'].str[3:]

# Draw random rows for every state instead of taking the first records of
# each state. random_state pins the draw so that Restart & Run All produces
# the same filtered CSVs (and therefore the same downstream results).
df1 = df1.groupby(["iso_subdivision"]).sample(n=ROWS_PER_STATE, random_state=SAMPLE_SEED)
df2 = df2.groupby(["state"]).sample(n=ROWS_PER_STATE, random_state=SAMPLE_SEED)

# Write the filtered datasets to disk.
df1.to_csv(PATH + "/pipeline/datasets/covid/filtered-bing.csv", index = False)
df2.to_csv(PATH + "/pipeline/datasets/covid/filtered-covid_tracking.csv", index = False)

PREPARING DATA FOR EMBDI

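Cleaning the filtered datasets: the in-memory frames are reused (the CSV files are not read back) and underscores in string columns are replaced with spaces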

In [None]:
# Paths of the filtered CSVs written by the previous cell. The frames already
# in memory are used as-is, so the files are not read back here; the paths
# are only needed later, when the info file is written.
f1 = f"{PATH}/pipeline/datasets/covid/filtered-bing.csv"
f2 = f"{PATH}/pipeline/datasets/covid/filtered-covid_tracking.csv"

# Replace every underscore with a space in all object-dtype columns of both
# frames (presumably because underscores are meaningful in EmbDI's token
# format -- TODO confirm).
for frame in (df1, df2):
    for column in frame.columns:
        if frame[column].dtype == "object":
            frame[column] = frame[column].str.replace("_", " ")

Datasets concatenation with horizon parameter

In [None]:
# Options passed to EmbDI's data_preprocessing. "output_file" is also reused
# by later cells to build the names of the concatenated CSV, the info file
# and the edgelist.
parameters = dict(
    output_file="bing-covid_tracking",
    concatenate="horizon",
    missing_value="nan,ukn,none,unknown,-",
    missing_value_strategy="",
    round_number=1,
    round_columns="",
    auto_merge=False,
    expand_columns="",
    tokenize_shared=False,
)

# Concatenate the two frames and store the result next to the source datasets.
df_c = dp.data_preprocessing([df1, df2], parameters)
df_c.to_csv(
    f"{PATH}/pipeline/datasets/covid/{parameters['output_file']}.csv",
    index=False,
)

Creating and saving info file

In [None]:
# The info file is named after parameters["output_file"] and is written for
# the two source frames together with the paths of their filtered CSVs.
info_file = f"{PATH}/pipeline/info/info-{parameters['output_file']}.txt"
dp.write_info_file([df1, df2], info_file, [f1, f2])

Creating and saving edgelist of concatenated dataset

In [None]:
# Relative paths (resolved against the working directory) of the concatenated
# dataset and of the edgelist to generate from it.
input_edgelist = f"pipeline/datasets/covid/{parameters['output_file']}.csv"
output_edgelist = f"pipeline/er_edgelists/{parameters['output_file']}_edgelist.txt"

# Assemble the edgelist.py invocation and show it.
command = " ".join(["./EmbDI/edgelist.py", "-i", input_edgelist, "-o", output_edgelist])
print(f"command to paste on the cell below: {command}")

In [None]:
# paste command after "%run "
%run ./EmbDI/edgelist.py -i pipeline/datasets/covid/bing-covid_tracking.csv -o pipeline/er_edgelists/bing-covid_tracking_edgelist.txt

Manually creating and saving matches files (a.k.a. ground truth)

In [None]:
# Ground truth for schema matching: insert your own pairs in "lines".
# Each entry is "<column of one dataset>,<matching column of the other>";
# the 0_/1_ prefixes presumably identify the dataset a column comes from
# (TODO confirm against EmbDI's match-file format).
lines = [
    '0_iso_subdivision,1_state',
    '0_confirmed,1_positive'
    ]

matches_file = 'pipeline/matches/sm-matches/sm-matches-bing-covid_tracking.txt'

# Create the target folder when it is missing, so a fresh checkout does not
# fail with FileNotFoundError.
os.makedirs(os.path.dirname(matches_file), exist_ok=True)

# One match per line, each terminated by a newline.
with open(matches_file, 'w') as f:
    f.writelines(line + '\n' for line in lines)

Writing the configuration file for embeddings training and the schema matching task

In [22]:
# Settings for the "train-match" run. All values are strings because the
# configuration is serialized below as plain "key:value" lines.
config = {

    # Input configuration:
    "task":"train-match",
    "input_file":"pipeline/er_edgelists/bing-covid_tracking_edgelist.txt",
    "output_file":"bing-covid_tracking-OUTPUT",
    "experiment_type":"SM",
    "match_file":"pipeline/matches/sm-matches/sm-matches-bing-covid_tracking.txt",
    "dataset_file":"pipeline/datasets/covid/bing-covid_tracking.csv",
    "test_dir":"pipeline/test_dir/covid",
    "dataset_info":"pipeline/info/info-bing-covid_tracking.txt",
    "embeddings_file":"",

    # Walks configuration:
    "sentence_length":"90",
    "n_sentences":"default",
    "follow_sub":"false",
    # {no, smooth, inverse_smooth, log, piecewise} 
    #"smoothing_method":"smooth,[k=0.2, target=200]",
    "smoothing_method":"no",
    "backtrack":"true",
    "write_walks":"true",
    "repl_numbers":"False",
    "repl_strings":"False",
    "flatten":"tt",
    "walks_strategy":"basic",

    # Embeddings configuration:
    "training_algorithm":"word2vec",
    "n_dimensions":"450",
    "window_size":"7",
    "learning_method":"skipgram",

    # Test configuration:
    "max_rank":"7",
    "ntop":"10",
    "ncand":"1",

    # Miscellaneous:
    "intersection":"false",
    "mlflow":"false"

}

train_config_file = 'pipeline/config_files/reproducibility/schema_matching/train_config-covid_policy_tracker-sm'

# Create the configuration folder when it is missing, so writing the file
# does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(train_config_file), exist_ok=True)

# One "key:value" pair per line.
with open(train_config_file, 'w') as f:
    for key, value in config.items():
        f.write(f'{key}:{value}\n')

Executing embeddings training and schema matching

In [23]:
# Run main.py with the training configuration file written by the previous cell.
%run main.py -f pipeline/config_files/reproducibility/schema_matching/train_config-covid_policy_tracker-sm



################################################################################
# File 1 out of 1
# Configuration file: train_config-covid_policy_tracker-sm
# Starting run................................................ 2023-01-27 19:40:06

# Starting graph construction................................. 2023-01-27 19:40:06
# Expanding columns: [tt].


# Loading edgelist_file.: 100%|██████████| 8554/8554 [00:00<00:00, 54648.45it/s]
# Preparing aliased randomizer.: 100%|██████████| 2292/2292 [00:00<00:00, 61937.66it/s]



# Graph construction complete................................. 2023-01-27 19:40:06
# Time required to build graph:............................... 0.20 seconds.
# 25466 sentences will be generated.
# Skipping search of overlapping values. 
# Generating basic random walks............................... 2023-01-27 19:40:06


# Sentence generation progress: 100%|██████████| 24372/24372 [00:03<00:00, 6120.24it/s]


# Completing fraction of random walks......................... 2023-01-27 19:40:10


# Sentence generation progress: 100%|██████████| 1094/1094 [00:00<00:00, 5864.97it/s]


# Generation of random walks completed........................ 2023-01-27 19:40:11

# Training embeddings......................................... 2023-01-27 19:40:11
# Writing embeddings in file: pipeline/embeddings/bing-covid_tracking-OUTPUT.emb
# Embeddings generation complete.............................. 2023-01-27 19:40:52
Extracting matched tuples


# ER - Finding node matches: 100%|██████████| 214/214 [00:00<00:00, 5557.50it/s]


# Time required to build sim struct: 0.14 seconds
# Time required to execute the ER task: 0.169531
Extracting matched columns
# Executing SM tests.
# Ending run.................................................. 2023-01-27 19:40:52
# Time required: 45.69 s





Writing the configuration file for the test task

In [24]:
# Settings for the "test" run: same inputs as the training run, but
# "embeddings_file" points at the embeddings file produced by training.
# All values are strings because the configuration is serialized below as
# plain "key:value" lines.
config = {

    # TODO: remove the parameters that are not needed for the testing task.
    # NOTE(review): the walks/embeddings values below differ from the training
    # configuration (60/300/5 here vs 90/450/7 there); presumably they are
    # ignored when the task is "test" -- confirm.

    # Input configuration:
    "task":"test",
    "input_file":"pipeline/er_edgelists/bing-covid_tracking_edgelist.txt",
    "output_file":"bing-covid_tracking-OUTPUT",
    "experiment_type":"SM",
    "match_file":"pipeline/matches/sm-matches/sm-matches-bing-covid_tracking.txt",
    "dataset_file":"pipeline/datasets/covid/bing-covid_tracking.csv",
    "test_dir":"pipeline/test_dir/covid",
    "dataset_info":"pipeline/info/info-bing-covid_tracking.txt",
    "embeddings_file":"pipeline/embeddings/bing-covid_tracking-OUTPUT.emb",

    # Walks configuration:
    "sentence_length":"60",
    "n_sentences":"default",
    "follow_sub":"false",
    "smoothing_method":"no",
    "backtrack":"true",
    "write_walks":"true",
    "repl_numbers":"False",
    "repl_strings":"False",
    "flatten":"tt",
    "walks_strategy":"basic",

    # Embeddings configuration:
    "training_algorithm":"word2vec",
    "n_dimensions":"300",
    "window_size":"5",
    "learning_method":"skipgram",

    # Test configuration:
    "ntop":"10",
    "ncand":"1",

    # Miscellaneous:
    "intersection":"false",
    "mlflow":"false"

}

test_config_file = 'pipeline/config_files/reproducibility/schema_matching/test_config-covid_policy_tracker-sm'

# Create the configuration folder when it is missing, so writing the file
# does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(test_config_file), exist_ok=True)

# One "key:value" pair per line.
with open(test_config_file, 'w') as f:
    for key, value in config.items():
        f.write(f'{key}:{value}\n')

Executing test task

In [25]:
# Run main.py with the test configuration file written by the previous cell.
%run main.py -f pipeline/config_files/reproducibility/schema_matching/test_config-covid_policy_tracker-sm

################################################################################
# File 1 out of 1
# Configuration file: test_config-covid_policy_tracker-sm
# Starting run................................................ 2023-01-27 19:41:01

# SCHEMA MATCHING
# Executing SM tests.
Model built from file pipeline/embeddings/bing-covid_tracking-OUTPUT.emb
P	R	F
16.6667	100.0000	28.5714	
# Ending run.................................................. 2023-01-27 19:41:02
# Time required: 0.15 s
