# Verification

## 1. Modules and Libraries

In [1]:
# Import necessary libraries
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Install a blanket "ignore" filter so warnings do not clutter the cell outputs.
# NOTE(review): this presumably also hides scikit-learn warnings raised while
# fitting (e.g. non-convergence at the configured max_iter) -- confirm intended.
warnings.filterwarnings('ignore')


## 2. Load Data

* Train data: ProMapEn train set
* Test data: ProMapEn, ProMapCz, Amazon-Walmart, Amazon-Google test sets

In [2]:
# Training features: the ProMapEn train split.
promapen_train_data = pd.read_csv("features/ProMapEn/promapen_train_similarities.csv")

# Test features, one CSV per benchmark, read in the order they are evaluated.
# The ProMapEn test split is generated as part of this project; the other three
# test sets are used as provided by the author of the implementation paper.
(
    promapen_test_data,
    promapcz_test_data,
    am_wm_test_data,
    am_go_test_data,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "features/ProMapEn/promapen_test_similarities.csv",
        "datasets/ProMapCz/promapcz-test_data_similarities.csv",
        "datasets/amazon-walmart/amazon_walmart-test_data_similarities.csv",
        "datasets/amazon-google/amazon_google-test_data_similarities.csv",
    )
]


## 3. Split the data into features and labels

In [3]:
# ProMapEn frames: the last column is the label, every other column is a feature.
promapen_X_train, promapen_y_train = (
    promapen_train_data.iloc[:, :-1],
    promapen_train_data.iloc[:, -1],
)
promapen_X_test, promapen_y_test = (
    promapen_test_data.iloc[:, :-1],
    promapen_test_data.iloc[:, -1],
)

# Externally provided test sets: the first two columns are dropped as well
# (presumably identifier columns -- TODO confirm against the CSVs).
promapcz_X_test, promapcz_y_test = (
    promapcz_test_data.iloc[:, 2:-1],
    promapcz_test_data.iloc[:, -1],
)
am_wm_X_test, am_wm_y_test = (
    am_wm_test_data.iloc[:, 2:-1],
    am_wm_test_data.iloc[:, -1],
)
am_go_X_test, am_go_y_test = (
    am_go_test_data.iloc[:, 2:-1],
    am_go_test_data.iloc[:, -1],
)


In [4]:
def prepare_test_data(dataframe, features):
    """
    Align a test-set frame with a given list of feature columns.

    The two ``specification_*_matches`` columns are renamed to
    ``specification_key`` / ``specification_key_value``; any requested feature
    that is still absent after the rename is added as a constant 0 column.

    Args:
        dataframe (pd.DataFrame): test features; the rename yields a new frame,
            so the extra columns are added to that frame, not to the input
        features (list-like): column names to return, in the desired order

    Return:
        dataframe: only the columns named in ``features``, in that order
    """
    column_aliases = {
        "specification_key_matches": "specification_key",
        "specification_key_value_matches": "specification_key_value",
    }
    aligned = dataframe.rename(columns=column_aliases)

    # Decide what is missing from the renamed input before adding anything.
    present = aligned.columns
    absent = [name for name in features if name not in present]
    for name in absent:
        aligned[name] = 0

    return aligned[features]

## 4. Train the Model with Best Params

* Model: MLP Classifier
* Best Parameters:
    * activation: tanh 
    * hidden_layer_sizes: (10, 50) 
    * learning_rate: constant
    * learning_rate_init: 0.01 
    * max_iter: 100
    * solver: adam

In [5]:
# Hyper-parameters selected for the MLP classifier.
nn_best_params = dict(
    activation='tanh',
    hidden_layer_sizes=(10, 50),
    learning_rate='constant',
    learning_rate_init=0.01,
    max_iter=100,
    solver='adam',
)

# random_state is pinned to 42 so repeated runs use the same seed.
nn_best_model = MLPClassifier(random_state=42, **nn_best_params)

# Fit on the ProMapEn training split; as the last expression of the cell,
# the value returned by fit() is what the notebook displays.
nn_best_model.fit(promapen_X_train, promapen_y_train)



## 5.0 Test on Other Datasets

### 5.1 Train data: ProMapEn and Test data: ProMapEn

In [6]:
# Predict on the ProMapEn test split with the fitted model.
y_pred = nn_best_model.predict(promapen_X_test)

# Scores are stored in the order accuracy, F1, precision, recall.
promapen_results = [
    metric(promapen_y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

print(
    "Train data: ProMapEn and Test data: ProMapEn",
    f"Accuracy: {promapen_results[0]}",
    f"F1 Score: {promapen_results[1]}",
    f"Precision: {promapen_results[2]}",
    f"Recall: {promapen_results[3]}",
    sep="\n",
)

Train data: ProMapEn and Test data: ProMapEn
Accuracy: 0.8006430868167203
F1 Score: 0.6593406593406593
Precision: 0.7407407407407407
Recall: 0.594059405940594


### 5.2 Train data: ProMapEn and Test data: ProMapCz

In [7]:
# Rename / zero-fill columns so the frame has exactly the training feature columns.
promapcz_X_test = prepare_test_data(promapcz_X_test, promapen_X_train.columns)

y_pred = nn_best_model.predict(promapcz_X_test)

# Scores are stored in the order accuracy, F1, precision, recall.
promapcz_results = [
    metric(promapcz_y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

print(
    "Train data: ProMapEn and Test data: ProMapCz",
    f"Accuracy: {promapcz_results[0]}",
    f"F1 Score: {promapcz_results[1]}",
    f"Precision: {promapcz_results[2]}",
    f"Recall: {promapcz_results[3]}",
    sep="\n",
)


Train data: ProMapEn and Test data: ProMapCz
Accuracy: 0.6421404682274248
F1 Score: 0.5836575875486381
Precision: 0.4716981132075472
Recall: 0.7653061224489796


### 5.3 Train data: ProMapEn and Test data: Amazon-Walmart

In [8]:
# Rename / zero-fill columns so the frame has exactly the training feature columns.
am_wm_X_test = prepare_test_data(am_wm_X_test, promapen_X_train.columns)

y_pred = nn_best_model.predict(am_wm_X_test)

# Scores are stored in the order accuracy, F1, precision, recall.
am_wm_results = [
    metric(am_wm_y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

# The accuracy label previously had a markdown heading pasted into the middle
# of it; it now reads "Accuracy:" like the other evaluation cells.
print(
    "Train data: ProMapEn and Test data: Amazon Walmart",
    f"Accuracy: {am_wm_results[0]}",
    f"F1 Score: {am_wm_results[1]}",
    f"Precision: {am_wm_results[2]}",
    f"Recall: {am_wm_results[3]}",
    sep="\n",
)


Train data: ProMapEn and Test data: Amazon Walmart
Accuracy: 0.7678855325914149
F1 Score: 0.5350318471337578
Precision: 0.9767441860465116
Recall: 0.3684210526315789


### 5.4 Train data: ProMapEn and Test data: Amazon-Google

In [9]:
# Rename / zero-fill columns so the frame has exactly the training feature columns.
am_go_X_test = prepare_test_data(am_go_X_test, promapen_X_train.columns)

y_pred = nn_best_model.predict(am_go_X_test)

# Scores are stored in the order accuracy, F1, precision, recall.
am_go_results = [
    metric(am_go_y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

print(
    "Train data: ProMapEn and Test data: Amazon Google",
    f"Accuracy: {am_go_results[0]}",
    f"F1 Score: {am_go_results[1]}",
    f"Precision: {am_go_results[2]}",
    f"Recall: {am_go_results[3]}",
    sep="\n",
)


Train data: ProMapEn and Test data: Amazon Google
Accuracy: 0.7527047913446677
F1 Score: 0.5604395604395604
Precision: 1.0
Recall: 0.3893129770992366


### 5.5 Result Analysis 

In [10]:
# One row per test set; column order matches how each *_results list was built
# (accuracy, F1, precision, recall). Every metric is rounded to 4 decimals.
test_results = pd.DataFrame(
    data=[promapen_results, promapcz_results, am_wm_results, am_go_results],
    index=["ProMapEn Test", "ProMapCz Test", "Am-Wm Test", "Am-Go Test"],
    columns=["Accuracy", "F1 Score", "Precision", "Recall"],
).round(4)

# Display only: show F1 first and accuracy last; test_results itself keeps its column order.
test_results.loc[:, ["F1 Score", "Precision", "Recall", "Accuracy"]]

Unnamed: 0,F1 Score,Precision,Recall,Accuracy
ProMapEn Test,0.6593,0.7407,0.5941,0.8006
ProMapCz Test,0.5837,0.4717,0.7653,0.6421
Am-Wm Test,0.535,0.9767,0.3684,0.7679
Am-Go Test,0.5604,1.0,0.3893,0.7527


In [11]:
# Write the summary table (metric columns plus the dataset-name index) to CSV.
# The output directory is created first so a run on a fresh checkout does not
# fail at the last cell just because "results/" is missing.
results_path = Path("results/all-test-data.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
test_results.to_csv(results_path, header=True, index=True)